use of org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException in project flink by splunk.
the class TaskExecutor method submitTask.
// ----------------------------------------------------------------------
// Task lifecycle RPCs
// ----------------------------------------------------------------------
/**
 * Handles a request to deploy a task onto this TaskExecutor.
 *
 * <p>The submission is rejected unless a JobManager connection exists for the job, the given
 * {@code jobMasterId} equals the id stored on that connection, and the slot identified by the
 * descriptor's allocation id can be marked active. Afterwards the offloaded descriptor data is
 * loaded, the job and task information are deserialized, the {@link Task} is created, added to
 * the task slot table and its thread is started.
 *
 * <p>A {@link TaskSubmissionException} raised along the way is not thrown to the caller; it is
 * returned as an exceptionally completed future instead.
 *
 * @param tdd descriptor of the task to deploy
 * @param jobMasterId id that must equal the id of the registered JobManager connection
 * @param timeout not read by this method body
 * @return a future completed with {@link Acknowledge} once the task thread has been started, or
 *     completed exceptionally with the {@link TaskSubmissionException} describing the rejection
 */
@Override
public CompletableFuture<Acknowledge> submitTask(TaskDeploymentDescriptor tdd, JobMasterId jobMasterId, Time timeout) {
try {
final JobID jobId = tdd.getJobId();
final ExecutionAttemptID executionAttemptID = tdd.getExecutionAttemptId();
// Reject the submission unless a JobManager connection is registered for this job.
final JobTable.Connection jobManagerConnection = jobTable.getConnection(jobId).orElseThrow(() -> {
final String message = "Could not submit task because there is no JobManager " + "associated for the job " + jobId + '.';
log.debug(message);
return new TaskSubmissionException(message);
});
// The submitting JobMaster must carry the same id as the registered connection.
if (!Objects.equals(jobManagerConnection.getJobMasterId(), jobMasterId)) {
final String message = "Rejecting the task submission because the job manager leader id " + jobMasterId + " does not match the expected job manager leader id " + jobManagerConnection.getJobMasterId() + '.';
log.debug(message);
throw new TaskSubmissionException(message);
}
// The target slot must be markable as active for this job / allocation id pair.
if (!taskSlotTable.tryMarkSlotActive(jobId, tdd.getAllocationId())) {
final String message = "No task slot allocated for job ID " + jobId + " and allocation ID " + tdd.getAllocationId() + '.';
log.debug(message);
throw new TaskSubmissionException(message);
}
// re-integrate offloaded data:
try {
tdd.loadBigData(taskExecutorBlobService.getPermanentBlobService());
} catch (IOException | ClassNotFoundException e) {
throw new TaskSubmissionException("Could not re-integrate offloaded TaskDeploymentDescriptor data.", e);
}
// deserialize the pre-serialized information
final JobInformation jobInformation;
final TaskInformation taskInformation;
try {
jobInformation = tdd.getSerializedJobInformation().deserializeValue(getClass().getClassLoader());
taskInformation = tdd.getSerializedTaskInformation().deserializeValue(getClass().getClassLoader());
} catch (IOException | ClassNotFoundException e) {
throw new TaskSubmissionException("Could not deserialize the job or task information.", e);
}
// Cross-check: the job id on the descriptor must agree with the deserialized job information.
if (!jobId.equals(jobInformation.getJobId())) {
throw new TaskSubmissionException("Inconsistent job ID information inside TaskDeploymentDescriptor (" + tdd.getJobId() + " vs. " + jobInformation.getJobId() + ")");
}
TaskManagerJobMetricGroup jobGroup = taskManagerMetricGroup.addJob(jobInformation.getJobId(), jobInformation.getJobName());
// note that a pre-existing job group can NOT be closed concurrently - this is done by
// the same TM thread in removeJobMetricsGroup
TaskMetricGroup taskMetricGroup = jobGroup.addTask(taskInformation.getJobVertexId(), tdd.getExecutionAttemptId(), taskInformation.getTaskName(), tdd.getSubtaskIndex(), tdd.getAttemptNumber());
InputSplitProvider inputSplitProvider = new RpcInputSplitProvider(jobManagerConnection.getJobManagerGateway(), taskInformation.getJobVertexId(), tdd.getExecutionAttemptId(), taskManagerConfiguration.getRpcTimeout());
// The error callback handed to the gateway schedules failTask for this attempt via runAsync.
final TaskOperatorEventGateway taskOperatorEventGateway = new RpcTaskOperatorEventGateway(jobManagerConnection.getJobManagerGateway(), executionAttemptID, (t) -> runAsync(() -> failTask(executionAttemptID, t)));
// Collect the per-job services exposed by the JobManager connection.
TaskManagerActions taskManagerActions = jobManagerConnection.getTaskManagerActions();
CheckpointResponder checkpointResponder = jobManagerConnection.getCheckpointResponder();
GlobalAggregateManager aggregateManager = jobManagerConnection.getGlobalAggregateManager();
LibraryCacheManager.ClassLoaderHandle classLoaderHandle = jobManagerConnection.getClassLoaderHandle();
ResultPartitionConsumableNotifier resultPartitionConsumableNotifier = jobManagerConnection.getResultPartitionConsumableNotifier();
PartitionProducerStateChecker partitionStateChecker = jobManagerConnection.getPartitionStateChecker();
final TaskLocalStateStore localStateStore = localStateStoresManager.localStateStoreForSubtask(jobId, tdd.getAllocationId(), taskInformation.getJobVertexId(), tdd.getSubtaskIndex());
// TODO: Pass config value from user program and do overriding here.
final StateChangelogStorage<?> changelogStorage;
try {
changelogStorage = changelogStoragesManager.stateChangelogStorageForJob(jobId, taskManagerConfiguration.getConfiguration(), jobGroup);
} catch (IOException e) {
throw new TaskSubmissionException(e);
}
final JobManagerTaskRestore taskRestore = tdd.getTaskRestore();
final TaskStateManager taskStateManager = new TaskStateManagerImpl(jobId, tdd.getExecutionAttemptId(), localStateStore, changelogStorage, taskRestore, checkpointResponder);
// The memory manager is looked up by the allocation id of the target slot.
MemoryManager memoryManager;
try {
memoryManager = taskSlotTable.getTaskMemoryManager(tdd.getAllocationId());
} catch (SlotNotFoundException e) {
throw new TaskSubmissionException("Could not submit task.", e);
}
Task task = new Task(jobInformation, taskInformation, tdd.getExecutionAttemptId(), tdd.getAllocationId(), tdd.getSubtaskIndex(), tdd.getAttemptNumber(), tdd.getProducedPartitions(), tdd.getInputGates(), memoryManager, taskExecutorServices.getIOManager(), taskExecutorServices.getShuffleEnvironment(), taskExecutorServices.getKvStateService(), taskExecutorServices.getBroadcastVariableManager(), taskExecutorServices.getTaskEventDispatcher(), externalResourceInfoProvider, taskStateManager, taskManagerActions, inputSplitProvider, checkpointResponder, taskOperatorEventGateway, aggregateManager, classLoaderHandle, fileCache, taskManagerConfiguration, taskMetricGroup, resultPartitionConsumableNotifier, partitionStateChecker, getRpcService().getScheduledExecutor());
taskMetricGroup.gauge(MetricNames.IS_BACK_PRESSURED, task::isBackPressured);
log.info("Received task {} ({}), deploy into slot with allocation id {}.", task.getTaskInfo().getTaskNameWithSubtasks(), tdd.getExecutionAttemptId(), tdd.getAllocationId());
boolean taskAdded;
try {
taskAdded = taskSlotTable.addTask(task);
} catch (SlotNotFoundException | SlotNotActiveException e) {
throw new TaskSubmissionException("Could not submit task.", e);
}
// The task thread is only started when the slot table reports the task as added.
if (taskAdded) {
task.startTaskThread();
setupResultPartitionBookkeeping(tdd.getJobId(), tdd.getProducedPartitions(), task.getTerminationFuture());
return CompletableFuture.completedFuture(Acknowledge.get());
} else {
final String message = "TaskManager already contains a task for id " + task.getExecutionId() + '.';
log.debug(message);
throw new TaskSubmissionException(message);
}
} catch (TaskSubmissionException e) {
// Submission failures are reported through the returned future instead of being thrown.
return FutureUtils.completedExceptionally(e);
}
}
use of org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException in project flink by splunk.
the class TaskExecutor method disconnectJobManagerConnection.
/**
 * Tears down the given JobManager connection.
 *
 * <p>In order: fails every task of the connection's job, tries to mark each active slot of the
 * job inactive (freeing the slot when that is refused), stops heartbeat monitoring of the
 * JobManager and disassociates from it, and finally disconnects the connection object.
 *
 * @param jobManagerConnection the connection to close
 * @param cause the reason for closing; attached to the failure handed to the tasks and passed on
 *     to the disassociation step
 */
private void disconnectJobManagerConnection(JobTable.Connection jobManagerConnection, Exception cause) {
    final JobID jobId = jobManagerConnection.getJobId();

    // The cause is only attached to the log line when debug logging is on.
    if (!log.isDebugEnabled()) {
        log.info("Close JobManager connection for job {}.", jobId);
    } else {
        log.debug("Close JobManager connection for job {}.", jobId, cause);
    }

    // Step 1: fail every task that runs under this job id.
    final Iterator<Task> jobTasks = taskSlotTable.getTasks(jobId);
    final FlinkException disconnectFailure = new FlinkException(String.format("Disconnect from JobManager responsible for %s.", jobId), cause);
    jobTasks.forEachRemaining(runningTask -> runningTask.failExternally(disconnectFailure));

    // Step 2: move the active slots back to state allocated (they may time out again).
    final Set<AllocationID> activeAllocations = taskSlotTable.getActiveTaskSlotAllocationIdsPerJob(jobId);
    final FlinkException notInactivatable = new FlinkException("Slot could not be marked inactive.");
    for (AllocationID allocation : activeAllocations) {
        try {
            final boolean nowInactive = taskSlotTable.markSlotInactive(allocation, taskManagerConfiguration.getSlotTimeout());
            if (!nowInactive) {
                freeSlotInternal(allocation, notInactivatable);
            }
        } catch (SlotNotFoundException notFound) {
            log.debug("Could not mark the slot {} inactive.", allocation, notFound);
        }
    }

    // Step 3: stop monitoring the JobManager and disassociate from it.
    try {
        jobManagerHeartbeatManager.unmonitorTarget(jobManagerConnection.getResourceId());
        disassociateFromJobManager(jobManagerConnection, cause);
    } catch (IOException ioFailure) {
        log.warn("Could not properly disassociate from JobManager {}.", jobManagerConnection.getJobManagerGateway().getAddress(), ioFailure);
    }

    jobManagerConnection.disconnect();
}
use of org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException in project flink by splunk.
the class TaskExecutor method handleAcceptedSlotOffers.
/**
 * Builds the callback that processes the JobManager's answer to a slot offer.
 *
 * <p>The callback ignores the answer when {@code offerId} is no longer the current offer for the
 * job. On a {@link TimeoutException} the slots are offered again; on any other failure all
 * offered slots are freed. On success, and only while the JobManager connection is still valid,
 * each accepted slot is marked active (slots that cannot be marked active are reported to the
 * JobManager via {@code failSlot}) and every remaining offered slot is freed as rejected.
 *
 * @param jobId job the offer was made for
 * @param jobMasterGateway gateway used to report slots that could not be activated
 * @param jobMasterId id of the JobMaster the offer was sent to; used to validate the connection
 * @param offeredSlots the slots that were offered; accepted slots are removed from this
 *     collection by the callback, so it must be modifiable
 * @param offerId identifier of this offer, compared against the latest offer recorded for the job
 * @return the callback taking the accepted slots and a possible failure
 */
@Nonnull
private BiConsumer<Iterable<SlotOffer>, Throwable> handleAcceptedSlotOffers(JobID jobId, JobMasterGateway jobMasterGateway, JobMasterId jobMasterId, Collection<SlotOffer> offeredSlots, UUID offerId) {
    return (Iterable<SlotOffer> acceptedSlots, Throwable throwable) -> {
        // check if this is the latest offer
        if (!offerId.equals(currentSlotOfferPerJob.get(jobId))) {
            // If this offer is outdated then it can be safely ignored.
            // If the response for a given slot is identical in both offers (accepted/rejected),
            // then this is naturally the case since the end-result is the same.
            // If the responses differ, then there are 2 cases to consider:
            // 1) initially rejected, later accepted
            // This can happen when the resource requirements of a job increases between
            // offers.
            // In this case the first response MUST be ignored, so that
            // the slot can be properly activated when the second response arrives.
            // 2) initially accepted, later rejected
            // This can happen when the resource requirements of a job decrease between
            // offers.
            // In this case the first response MAY be ignored, because the job no longer
            // requires the slot (and already has initiated steps to free it) and we can thus
            // assume that any in-flight task submissions are no longer relevant for the job
            // execution.
            log.debug("Discard slot offer response since there is a newer offer for the job {}.", jobId);
            return;
        }
        if (throwable != null) {
            if (throwable instanceof TimeoutException) {
                log.info("Slot offering to JobManager did not finish in time. Retrying the slot offering.");
                // We ran into a timeout. Try again.
                offerSlotsToJobManager(jobId);
            } else {
                log.warn("Slot offering to JobManager failed. Freeing the slots " + "and returning them to the ResourceManager.", throwable);
                // We encountered an exception. Free the slots and return them to the RM.
                for (SlotOffer reservedSlot : offeredSlots) {
                    freeSlotInternal(reservedSlot.getAllocationId(), throwable);
                }
            }
        } else {
            // check if the response is still valid
            if (isJobManagerConnectionValid(jobId, jobMasterId)) {
                // mark accepted slots active
                for (SlotOffer acceptedSlot : acceptedSlots) {
                    final AllocationID allocationId = acceptedSlot.getAllocationId();
                    try {
                        if (!taskSlotTable.markSlotActive(allocationId)) {
                            // the slot is either free or releasing at the moment
                            final String message = "Could not mark slot " + allocationId + " active.";
                            log.debug(message);
                            jobMasterGateway.failSlot(getResourceID(), allocationId, new FlinkException(message));
                        }
                    } catch (SlotNotFoundException e) {
                        // Keep the original exception as the cause so the reason the slot is
                        // unknown is not lost, and log it like the sibling failure branch does.
                        final String message = "Could not mark slot " + allocationId + " active.";
                        log.debug(message, e);
                        jobMasterGateway.failSlot(getResourceID(), allocationId, new FlinkException(message, e));
                    }
                    // Whatever is left in offeredSlots after this loop counts as rejected.
                    offeredSlots.remove(acceptedSlot);
                }
                final Exception e = new Exception("The slot was rejected by the JobManager.");
                for (SlotOffer rejectedSlot : offeredSlots) {
                    freeSlotInternal(rejectedSlot.getAllocationId(), e);
                }
            } else {
                // discard the response since there is a new leader for the job
                log.debug("Discard slot offer response since there is a new leader " + "for the job {}.", jobId);
            }
        }
    };
}
use of org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException in project flink by apache.
the class TaskExecutor method submitTask.
// ======================================================================
// RPC methods
// ======================================================================
// ----------------------------------------------------------------------
// Task lifecycle RPCs
// ----------------------------------------------------------------------
/**
 * RPC endpoint that deploys a task onto this TaskExecutor.
 *
 * <p>The job and task information are deserialized first. The submission is then rejected when
 * no JobManager connection is known for the job, when the given leader id differs from the one
 * stored on the connection, or when no active slot exists for the job / allocation id pair.
 * Otherwise the {@link Task} is created, added to the task slot table and its thread is started.
 *
 * @param tdd descriptor of the task to deploy
 * @param jobManagerLeaderId leader id the caller claims; must equal the connection's leader id
 * @return the acknowledgement once the task thread has been started
 * @throws TaskSubmissionException if deserialization fails, one of the checks above rejects the
 *     submission, the task cannot be added to its slot, or the slot table already holds a task
 *     with the same execution id
 */
@RpcMethod
public Acknowledge submitTask(TaskDeploymentDescriptor tdd, UUID jobManagerLeaderId) throws TaskSubmissionException {
    // Unpack the pre-serialized job and task information before anything else.
    final JobInformation jobInformation;
    final TaskInformation taskInformation;
    try {
        final ClassLoader ownClassLoader = getClass().getClassLoader();
        jobInformation = tdd.getSerializedJobInformation().deserializeValue(ownClassLoader);
        taskInformation = tdd.getSerializedTaskInformation().deserializeValue(ownClassLoader);
    } catch (IOException | ClassNotFoundException e) {
        throw new TaskSubmissionException("Could not deserialize the job or task information.", e);
    }

    final JobID jobId = jobInformation.getJobId();

    // Guard 1: a JobManager must be associated with the job.
    final JobManagerConnection connection = jobManagerTable.get(jobId);
    if (connection == null) {
        final String rejection = "Could not submit task because there is no JobManager " + "associated for the job " + jobId + '.';
        log.debug(rejection);
        throw new TaskSubmissionException(rejection);
    }

    // Guard 2: the caller must present the leader id stored on the connection.
    if (!connection.getLeaderId().equals(jobManagerLeaderId)) {
        final String rejection = "Rejecting the task submission because the job manager leader id " + jobManagerLeaderId + " does not match the expected job manager leader id " + connection.getLeaderId() + '.';
        log.debug(rejection);
        throw new TaskSubmissionException(rejection);
    }

    // Guard 3: an active slot must exist for this job and allocation id.
    if (!taskSlotTable.existsActiveSlot(jobId, tdd.getAllocationId())) {
        final String rejection = "No task slot allocated for job ID " + jobId + " and allocation ID " + tdd.getAllocationId() + '.';
        log.debug(rejection);
        throw new TaskSubmissionException(rejection);
    }

    // Gather everything the Task constructor needs.
    final TaskMetricGroup metrics = taskManagerMetricGroup.addTaskForJob(jobInformation.getJobId(), jobInformation.getJobName(), taskInformation.getJobVertexId(), tdd.getExecutionAttemptId(), taskInformation.getTaskName(), tdd.getSubtaskIndex(), tdd.getAttemptNumber());
    final InputSplitProvider splitProvider = new RpcInputSplitProvider(connection.getLeaderId(), connection.getJobManagerGateway(), jobInformation.getJobId(), taskInformation.getJobVertexId(), tdd.getExecutionAttemptId(), taskManagerConfiguration.getTimeout());
    final TaskManagerActions actions = connection.getTaskManagerActions();
    final CheckpointResponder responder = connection.getCheckpointResponder();
    final LibraryCacheManager libraries = connection.getLibraryCacheManager();
    final ResultPartitionConsumableNotifier consumableNotifier = connection.getResultPartitionConsumableNotifier();
    final PartitionProducerStateChecker stateChecker = connection.getPartitionStateChecker();

    final Task task = new Task(
            jobInformation,
            taskInformation,
            tdd.getExecutionAttemptId(),
            tdd.getAllocationId(),
            tdd.getSubtaskIndex(),
            tdd.getAttemptNumber(),
            tdd.getProducedPartitions(),
            tdd.getInputGates(),
            tdd.getTargetSlotNumber(),
            tdd.getTaskStateHandles(),
            memoryManager,
            ioManager,
            networkEnvironment,
            broadcastVariableManager,
            actions,
            splitProvider,
            responder,
            libraries,
            fileCache,
            taskManagerConfiguration,
            metrics,
            consumableNotifier,
            stateChecker,
            getRpcService().getExecutor());
    log.info("Received task {}.", task.getTaskInfo().getTaskNameWithSubtasks());

    final boolean added;
    try {
        added = taskSlotTable.addTask(task);
    } catch (SlotNotFoundException | SlotNotActiveException e) {
        throw new TaskSubmissionException("Could not submit task.", e);
    }

    // The slot table refuses a task whose execution id it already holds.
    if (!added) {
        final String rejection = "TaskManager already contains a task for id " + task.getExecutionId() + '.';
        log.debug(rejection);
        throw new TaskSubmissionException(rejection);
    }

    task.startTaskThread();
    return Acknowledge.get();
}
use of org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException in project flink by apache.
the class TaskExecutor method allocateSlotForJob.
/**
 * Allocates the given slot for the job and looks up, or creates, the job's entry in the job
 * table.
 *
 * <p>If obtaining the job entry fails, the slot allocation is rolled back before the failure is
 * reported.
 *
 * @param jobId job the slot is allocated for
 * @param slotId slot to allocate
 * @param allocationId allocation id under which the slot is allocated
 * @param resourceProfile resource profile passed to the slot allocation
 * @param targetAddress address handed to the job registration when a new job entry is created
 * @return whatever {@code isConnected()} reports for the job entry
 * @throws SlotAllocationException if the job entry could not be obtained (the cause is attached),
 *     or — presumably — if the initial slot allocation fails; verify against {@code allocateSlot}
 */
private boolean allocateSlotForJob(JobID jobId, SlotID slotId, AllocationID allocationId, ResourceProfile resourceProfile, String targetAddress) throws SlotAllocationException {
    allocateSlot(slotId, jobId, allocationId, resourceProfile);

    final JobTable.Job registeredJob;
    try {
        registeredJob = jobTable.getOrCreateJob(jobId, () -> registerNewJobAndCreateServices(jobId, targetAddress));
    } catch (Exception jobCreationFailure) {
        undoSlotAllocation(slotId, allocationId);
        throw new SlotAllocationException("Could not create new job.", jobCreationFailure);
    }

    return registeredJob.isConnected();
}

/**
 * Rolls back a slot allocation: frees the slot, releases the local state kept under the
 * allocation id and verifies that the slot is reported as free afterwards.
 */
private void undoSlotAllocation(SlotID slotId, AllocationID allocationId) {
    // free the allocated slot
    try {
        taskSlotTable.freeSlot(allocationId);
    } catch (SlotNotFoundException unexpected) {
        // The slot was allocated just before, so it should still exist. Fail hard if not.
        onFatalError(unexpected);
    }

    // release local state under the allocation id.
    localStateStoresManager.releaseLocalStateForAllocationId(allocationId);

    // sanity check
    final boolean slotIsFree = taskSlotTable.isSlotFree(slotId.getSlotNumber());
    if (!slotIsFree) {
        onFatalError(new Exception("Could not free slot " + slotId));
    }
}
Aggregations