Search in sources:

Example 1 with SlotNotFoundException

use of org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException in project flink by apache.

the class TaskExecutor method offerSlotsToJobManager.

// ------------------------------------------------------------------------
//  Internal job manager connection methods
// ------------------------------------------------------------------------
/**
 * Offers the slots currently allocated for the given job to the job's leading JobManager.
 *
 * <p>Each allocated slot is first marked active. A slot that cannot be marked active (it is
 * unknown to the slot table, or {@code markSlotActive} returns {@code false}) is reported to
 * the JobManager via {@code failSlot} and is left out of the offer.
 *
 * <p>When the JobManager answers, every offered slot that was not accepted is freed. If the
 * offer times out it is retried; on any other failure all offered slots are freed.
 *
 * @param jobId id of the job whose allocated slots are offered
 */
private void offerSlotsToJobManager(final JobID jobId) {
    final JobManagerConnection jobManagerConnection = jobManagerTable.get(jobId);
    if (jobManagerConnection == null) {
        log.debug("There is no job manager connection to the leader of job {}.", jobId);
    } else {
        if (taskSlotTable.hasAllocatedSlots(jobId)) {
            log.info("Offer reserved slots to the leader of job {}.", jobId);
            final JobMasterGateway jobMasterGateway = jobManagerConnection.getJobManagerGateway();
            final Iterator<TaskSlot> reservedSlotsIterator = taskSlotTable.getAllocatedSlots(jobId);
            final UUID leaderId = jobManagerConnection.getLeaderId();
            final Collection<SlotOffer> reservedSlots = new HashSet<>(2);
            while (reservedSlotsIterator.hasNext()) {
                final SlotOffer offer = reservedSlotsIterator.next().generateSlotOffer();
                boolean markedActive;
                Throwable activationFailure = null;
                try {
                    markedActive = taskSlotTable.markSlotActive(offer.getAllocationId());
                } catch (SlotNotFoundException e) {
                    markedActive = false;
                    activationFailure = e;
                }
                if (!markedActive) {
                    // The slot is unknown, free or releasing at the moment. Report the failure
                    // with the slot's allocation id (not the job id) and do not offer a slot
                    // which we have just declared as failed.
                    final String message = "Could not mark slot " + offer.getAllocationId() + " active.";
                    log.debug(message);
                    jobMasterGateway.failSlot(getResourceID(), offer.getAllocationId(), leaderId, new Exception(message, activationFailure));
                    continue;
                }
                reservedSlots.add(offer);
            }
            Future<Iterable<SlotOffer>> acceptedSlotsFuture = jobMasterGateway.offerSlots(getResourceID(), reservedSlots, leaderId, taskManagerConfiguration.getTimeout());
            acceptedSlotsFuture.thenAcceptAsync(new AcceptFunction<Iterable<SlotOffer>>() {

                @Override
                public void accept(Iterable<SlotOffer> acceptedSlots) {
                    // check if the response is still valid
                    if (isJobManagerConnectionValid(jobId, leaderId)) {
                        // drop the accepted slots so that only the rejected offers remain
                        for (SlotOffer acceptedSlot : acceptedSlots) {
                            reservedSlots.remove(acceptedSlot);
                        }
                        final Exception e = new Exception("The slot was rejected by the JobManager.");
                        for (SlotOffer rejectedSlot : reservedSlots) {
                            freeSlot(rejectedSlot.getAllocationId(), e);
                        }
                    } else {
                        // discard the response since there is a new leader for the job
                        log.debug("Discard offer slot response since there is a new leader " + "for the job {}.", jobId);
                    }
                }
            }, getMainThreadExecutor());
            acceptedSlotsFuture.exceptionallyAsync(new ApplyFunction<Throwable, Void>() {

                @Override
                public Void apply(Throwable throwable) {
                    if (throwable instanceof TimeoutException) {
                        // We ran into a timeout. Try again.
                        offerSlotsToJobManager(jobId);
                    } else {
                        // We encountered an exception. Free the slots and return them to the RM.
                        for (SlotOffer reservedSlot : reservedSlots) {
                            freeSlot(reservedSlot.getAllocationId(), throwable);
                        }
                    }
                    return null;
                }
            }, getMainThreadExecutor());
        } else {
            log.debug("There are no unassigned slots for the job {}.", jobId);
        }
    }
}
Also used : SlotNotFoundException(org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException) SlotOffer(org.apache.flink.runtime.taskexecutor.slot.SlotOffer) TaskSlot(org.apache.flink.runtime.taskexecutor.slot.TaskSlot) JobMasterGateway(org.apache.flink.runtime.jobmaster.JobMasterGateway) TimeoutException(java.util.concurrent.TimeoutException) PartitionException(org.apache.flink.runtime.taskexecutor.exceptions.PartitionException) CheckpointException(org.apache.flink.runtime.taskexecutor.exceptions.CheckpointException) SlotAllocationException(org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException) TaskSubmissionException(org.apache.flink.runtime.taskexecutor.exceptions.TaskSubmissionException) TaskException(org.apache.flink.runtime.taskexecutor.exceptions.TaskException) SlotNotActiveException(org.apache.flink.runtime.taskexecutor.slot.SlotNotActiveException) SlotNotFoundException(org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException) IOException(java.io.IOException) UUID(java.util.UUID) HashSet(java.util.HashSet) TimeoutException(java.util.concurrent.TimeoutException)

Example 2 with SlotNotFoundException

use of org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException in project flink by apache.

the class TaskExecutor method closeJobManagerConnection.

/**
 * Closes the connection to the JobManager of the given job.
 *
 * <p>The method fails all tasks of the job, moves the job's active slots back to the
 * inactive state (freeing those which cannot be marked inactive) and finally removes the
 * JobManager connection from the local bookkeeping and disassociates from it.
 *
 * @param jobId id of the job whose JobManager connection is closed
 * @param cause reason for closing the connection, handed to the disassociation step
 */
private void closeJobManagerConnection(JobID jobId, Exception cause) {
    log.info("Close JobManager connection for job {}.", jobId);
    // 1. fail tasks running under this JobID
    Iterator<Task> tasks = taskSlotTable.getTasks(jobId);
    while (tasks.hasNext()) {
        tasks.next().failExternally(new Exception("JobManager responsible for " + jobId + " lost the leadership."));
    }
    // 2. Move the active slots to state allocated (possible to time out again)
    Iterator<AllocationID> activeSlots = taskSlotTable.getActiveSlots(jobId);
    while (activeSlots.hasNext()) {
        AllocationID activeSlot = activeSlots.next();
        try {
            if (!taskSlotTable.markSlotInactive(activeSlot, taskManagerConfiguration.getTimeout())) {
                freeSlot(activeSlot, new Exception("Slot could not be marked inactive."));
            }
        } catch (SlotNotFoundException e) {
            // log the affected slot (its allocation id), not the job id
            log.debug("Could not mark the slot {} inactive.", activeSlot, e);
        }
    }
    // 3. Disassociate from the JobManager
    JobManagerConnection jobManagerConnection = jobManagerTable.remove(jobId);
    if (jobManagerConnection != null) {
        try {
            jobManagerHeartbeatManager.unmonitorTarget(jobManagerConnection.getResourceID());
            jobManagerConnections.remove(jobManagerConnection.getResourceID());
            disassociateFromJobManager(jobManagerConnection, cause);
        } catch (IOException e) {
            log.warn("Could not properly disassociate from JobManager {}.", jobManagerConnection.getJobManagerGateway().getAddress(), e);
        }
    }
}
Also used : SlotNotFoundException(org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException) Task(org.apache.flink.runtime.taskmanager.Task) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) IOException(java.io.IOException) TimeoutException(java.util.concurrent.TimeoutException) PartitionException(org.apache.flink.runtime.taskexecutor.exceptions.PartitionException) CheckpointException(org.apache.flink.runtime.taskexecutor.exceptions.CheckpointException) SlotAllocationException(org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException) TaskSubmissionException(org.apache.flink.runtime.taskexecutor.exceptions.TaskSubmissionException) TaskException(org.apache.flink.runtime.taskexecutor.exceptions.TaskException) SlotNotActiveException(org.apache.flink.runtime.taskexecutor.slot.SlotNotActiveException) SlotNotFoundException(org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException) IOException(java.io.IOException)

Example 3 with SlotNotFoundException

use of org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException in project flink by apache.

the class TaskExecutor method requestSlot.

// ----------------------------------------------------------------------
// Slot allocation RPCs
// ----------------------------------------------------------------------
/**
 * Handles a slot request issued by the ResourceManager.
 *
 * <p>The request is rejected if this TaskManager has no ResourceManager connection, if the
 * given leader id differs from the one of the connected ResourceManager, or if the slot
 * cannot be allocated for the given job and allocation id. After the slot is allocated it is
 * either offered to an already known JobManager of the job, or the job is added to the job
 * leader service. Should adding the job fail, the slot is freed again.
 *
 * @param slotId identifying the requested slot
 * @param jobId identifying the job for which the request is issued
 * @param allocationId id for the request
 * @param targetAddress of the job manager requesting the slot
 * @param rmLeaderId current leader id of the ResourceManager
 * @throws SlotAllocationException if the slot allocation fails
 * @return answer to the slot request
 */
@RpcMethod
public TMSlotRequestReply requestSlot(final SlotID slotId, final JobID jobId, final AllocationID allocationId, final String targetAddress, final UUID rmLeaderId) throws SlotAllocationException {
    log.info("Receive slot request {} for job {} from resource manager with leader id {}.", allocationId, jobId, rmLeaderId);

    // Guard: a ResourceManager connection is required to serve slot requests.
    if (resourceManagerConnection == null) {
        final String notConnected = "TaskManager is not connected to a resource manager.";
        log.debug(notConnected);
        throw new SlotAllocationException(notConnected);
    }

    // Guard: only accept requests coming from the ResourceManager leader we are connected to.
    if (!resourceManagerConnection.getTargetLeaderId().equals(rmLeaderId)) {
        final String leaderMismatch = "The leader id " + rmLeaderId + " does not match with the leader id of the connected resource manager " + resourceManagerConnection.getTargetLeaderId() + '.';
        log.debug(leaderMismatch);
        throw new SlotAllocationException(leaderMismatch);
    }

    if (!taskSlotTable.isSlotFree(slotId.getSlotNumber())) {
        // An occupied slot is only acceptable if it already belongs to this very request.
        if (!taskSlotTable.isAllocated(slotId.getSlotNumber(), jobId, allocationId)) {
            final String alreadyTaken = "The slot " + slotId + " has already been allocated for a different job.";
            log.info(alreadyTaken);
            throw new SlotAllocationException(alreadyTaken);
        }
    } else if (!taskSlotTable.allocateSlot(slotId.getSlotNumber(), jobId, allocationId, taskManagerConfiguration.getTimeout())) {
        log.info("Could not allocate slot for {}.", allocationId);
        throw new SlotAllocationException("Could not allocate slot.");
    } else {
        log.info("Allocated slot for {}.", allocationId);
    }

    if (!jobManagerTable.contains(jobId)) {
        try {
            jobLeaderService.addJob(jobId, targetAddress);
        } catch (Exception addJobFailure) {
            // Roll back: release the slot which was allocated for this request.
            try {
                taskSlotTable.freeSlot(allocationId);
            } catch (SlotNotFoundException missingSlot) {
                // The slot has just been allocated, hence it must exist. Treat this as fatal.
                onFatalError(missingSlot);
            }
            // Sanity check: the slot has to be free after the roll back.
            if (!taskSlotTable.isSlotFree(slotId.getSlotNumber())) {
                onFatalError(new Exception("Could not free slot " + slotId));
            }
            throw new SlotAllocationException("Could not add job to job leader service.", addJobFailure);
        }
    } else {
        offerSlotsToJobManager(jobId);
    }

    return new TMSlotRequestRegistered(resourceManagerConnection.getRegistrationId(), getResourceID(), allocationId);
}
Also used : SlotNotFoundException(org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException) SlotAllocationException(org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException) TMSlotRequestRegistered(org.apache.flink.runtime.resourcemanager.messages.taskexecutor.TMSlotRequestRegistered) TimeoutException(java.util.concurrent.TimeoutException) PartitionException(org.apache.flink.runtime.taskexecutor.exceptions.PartitionException) CheckpointException(org.apache.flink.runtime.taskexecutor.exceptions.CheckpointException) SlotAllocationException(org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException) TaskSubmissionException(org.apache.flink.runtime.taskexecutor.exceptions.TaskSubmissionException) TaskException(org.apache.flink.runtime.taskexecutor.exceptions.TaskException) SlotNotActiveException(org.apache.flink.runtime.taskexecutor.slot.SlotNotActiveException) SlotNotFoundException(org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException) IOException(java.io.IOException) RpcMethod(org.apache.flink.runtime.rpc.RpcMethod)

Example 4 with SlotNotFoundException

use of org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException in project flink by apache.

the class TaskExecutor method freeSlot.

/**
 * Frees the slot belonging to the given allocation id and, if a slot was actually freed and
 * a ResourceManager is connected, notifies the ResourceManager that the slot is available.
 *
 * <p>If the slot table does not know the allocation id, the failure is only logged.
 *
 * @param allocationId allocation id of the slot to free; must not be null
 * @param cause reason for freeing the slot, passed on to the slot table
 */
private void freeSlot(AllocationID allocationId, Throwable cause) {
    Preconditions.checkNotNull(allocationId);
    try {
        final int slotIndex = taskSlotTable.freeSlot(allocationId, cause);
        if (slotIndex == -1 || !isConnectedToResourceManager()) {
            // nothing was freed, or there is nobody to report the freed slot to
            return;
        }
        // the slot was freed. Tell the RM about it
        resourceManagerConnection.getTargetGateway().notifySlotAvailable(
            resourceManagerConnection.getTargetLeaderId(),
            resourceManagerConnection.getRegistrationId(),
            new SlotID(getResourceID(), slotIndex));
    } catch (SlotNotFoundException notFound) {
        log.debug("Could not free slot for allocation id {}.", allocationId, notFound);
    }
}
Also used : SlotNotFoundException(org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException) SlotID(org.apache.flink.runtime.clusterframework.types.SlotID) RpcEndpoint(org.apache.flink.runtime.rpc.RpcEndpoint) ResourceManagerGateway(org.apache.flink.runtime.resourcemanager.ResourceManagerGateway)

Example 5 with SlotNotFoundException

use of org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException in project flink by apache.

the class MetricUtils method getUsedManagedMemory.

/**
 * Sums up the used memory (memory size minus available memory) reported by the memory
 * managers of all active task slots.
 *
 * <p>Slots which are no longer present in the table when their memory manager is looked up
 * are skipped.
 *
 * @param taskSlotTable slot table to inspect
 * @return accumulated used memory over all active task slots
 */
private static long getUsedManagedMemory(TaskSlotTable<?> taskSlotTable) {
    long total = 0L;
    for (AllocationID slotAllocation : taskSlotTable.getActiveTaskSlotAllocationIds()) {
        try {
            final MemoryManager slotMemory = taskSlotTable.getTaskMemoryManager(slotAllocation);
            total += slotMemory.getMemorySize() - slotMemory.availableMemory();
        } catch (SlotNotFoundException ignored) {
            // the slot vanished in the meantime; it does not contribute to the sum
            LOG.debug("The task slot {} is not present anymore and will be ignored in calculating the amount of used memory.", slotAllocation);
        }
    }
    return total;
}
Also used : SlotNotFoundException(org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException) AllocationID(org.apache.flink.runtime.clusterframework.types.AllocationID) MemoryManager(org.apache.flink.runtime.memory.MemoryManager)

Aggregations

SlotNotFoundException (org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException)11 IOException (java.io.IOException)8 TaskSubmissionException (org.apache.flink.runtime.taskexecutor.exceptions.TaskSubmissionException)7 SlotNotActiveException (org.apache.flink.runtime.taskexecutor.slot.SlotNotActiveException)7 TimeoutException (java.util.concurrent.TimeoutException)5 SlotAllocationException (org.apache.flink.runtime.taskexecutor.exceptions.SlotAllocationException)5 TaskException (org.apache.flink.runtime.taskexecutor.exceptions.TaskException)5 JobID (org.apache.flink.api.common.JobID)4 AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID)4 Task (org.apache.flink.runtime.taskmanager.Task)4 CheckpointException (org.apache.flink.runtime.taskexecutor.exceptions.CheckpointException)3 PartitionException (org.apache.flink.runtime.taskexecutor.exceptions.PartitionException)3 FlinkException (org.apache.flink.util.FlinkException)3 CompletionException (java.util.concurrent.CompletionException)2 CheckpointException (org.apache.flink.runtime.checkpoint.CheckpointException)2 SlotID (org.apache.flink.runtime.clusterframework.types.SlotID)2 LibraryCacheManager (org.apache.flink.runtime.execution.librarycache.LibraryCacheManager)2 JobInformation (org.apache.flink.runtime.executiongraph.JobInformation)2 TaskInformation (org.apache.flink.runtime.executiongraph.TaskInformation)2 ResultPartitionConsumableNotifier (org.apache.flink.runtime.io.network.partition.ResultPartitionConsumableNotifier)2