Search in sources :

Example 1 with TaskSubmissionException

use of org.apache.flink.runtime.taskexecutor.exceptions.TaskSubmissionException in project flink by apache.

the class TaskExecutor method submitTask.

// ----------------------------------------------------------------------
// Task lifecycle RPCs
// ----------------------------------------------------------------------
@Override
public CompletableFuture<Acknowledge> submitTask(TaskDeploymentDescriptor tdd, JobMasterId jobMasterId, Time timeout) {
    try {
        final JobID jobId = tdd.getJobId();
        final ExecutionAttemptID executionAttemptID = tdd.getExecutionAttemptId();
        final JobTable.Connection jobManagerConnection = jobTable.getConnection(jobId).orElseThrow(() -> {
            final String message = "Could not submit task because there is no JobManager " + "associated for the job " + jobId + '.';
            log.debug(message);
            return new TaskSubmissionException(message);
        });
        if (!Objects.equals(jobManagerConnection.getJobMasterId(), jobMasterId)) {
            final String message = "Rejecting the task submission because the job manager leader id " + jobMasterId + " does not match the expected job manager leader id " + jobManagerConnection.getJobMasterId() + '.';
            log.debug(message);
            throw new TaskSubmissionException(message);
        }
        if (!taskSlotTable.tryMarkSlotActive(jobId, tdd.getAllocationId())) {
            final String message = "No task slot allocated for job ID " + jobId + " and allocation ID " + tdd.getAllocationId() + '.';
            log.debug(message);
            throw new TaskSubmissionException(message);
        }
        // re-integrate offloaded data:
        try {
            tdd.loadBigData(taskExecutorBlobService.getPermanentBlobService());
        } catch (IOException | ClassNotFoundException e) {
            throw new TaskSubmissionException("Could not re-integrate offloaded TaskDeploymentDescriptor data.", e);
        }
        // deserialize the pre-serialized information
        final JobInformation jobInformation;
        final TaskInformation taskInformation;
        try {
            jobInformation = tdd.getSerializedJobInformation().deserializeValue(getClass().getClassLoader());
            taskInformation = tdd.getSerializedTaskInformation().deserializeValue(getClass().getClassLoader());
        } catch (IOException | ClassNotFoundException e) {
            throw new TaskSubmissionException("Could not deserialize the job or task information.", e);
        }
        if (!jobId.equals(jobInformation.getJobId())) {
            throw new TaskSubmissionException("Inconsistent job ID information inside TaskDeploymentDescriptor (" + tdd.getJobId() + " vs. " + jobInformation.getJobId() + ")");
        }
        TaskManagerJobMetricGroup jobGroup = taskManagerMetricGroup.addJob(jobInformation.getJobId(), jobInformation.getJobName());
        // note that a pre-existing job group can NOT be closed concurrently - this is done by
        // the same TM thread in removeJobMetricsGroup
        TaskMetricGroup taskMetricGroup = jobGroup.addTask(taskInformation.getJobVertexId(), tdd.getExecutionAttemptId(), taskInformation.getTaskName(), tdd.getSubtaskIndex(), tdd.getAttemptNumber());
        InputSplitProvider inputSplitProvider = new RpcInputSplitProvider(jobManagerConnection.getJobManagerGateway(), taskInformation.getJobVertexId(), tdd.getExecutionAttemptId(), taskManagerConfiguration.getRpcTimeout());
        final TaskOperatorEventGateway taskOperatorEventGateway = new RpcTaskOperatorEventGateway(jobManagerConnection.getJobManagerGateway(), executionAttemptID, (t) -> runAsync(() -> failTask(executionAttemptID, t)));
        TaskManagerActions taskManagerActions = jobManagerConnection.getTaskManagerActions();
        CheckpointResponder checkpointResponder = jobManagerConnection.getCheckpointResponder();
        GlobalAggregateManager aggregateManager = jobManagerConnection.getGlobalAggregateManager();
        LibraryCacheManager.ClassLoaderHandle classLoaderHandle = jobManagerConnection.getClassLoaderHandle();
        ResultPartitionConsumableNotifier resultPartitionConsumableNotifier = jobManagerConnection.getResultPartitionConsumableNotifier();
        PartitionProducerStateChecker partitionStateChecker = jobManagerConnection.getPartitionStateChecker();
        final TaskLocalStateStore localStateStore = localStateStoresManager.localStateStoreForSubtask(jobId, tdd.getAllocationId(), taskInformation.getJobVertexId(), tdd.getSubtaskIndex());
        // TODO: Pass config value from user program and do overriding here.
        final StateChangelogStorage<?> changelogStorage;
        try {
            changelogStorage = changelogStoragesManager.stateChangelogStorageForJob(jobId, taskManagerConfiguration.getConfiguration(), jobGroup);
        } catch (IOException e) {
            throw new TaskSubmissionException(e);
        }
        final JobManagerTaskRestore taskRestore = tdd.getTaskRestore();
        final TaskStateManager taskStateManager = new TaskStateManagerImpl(jobId, tdd.getExecutionAttemptId(), localStateStore, changelogStorage, taskRestore, checkpointResponder);
        MemoryManager memoryManager;
        try {
            memoryManager = taskSlotTable.getTaskMemoryManager(tdd.getAllocationId());
        } catch (SlotNotFoundException e) {
            throw new TaskSubmissionException("Could not submit task.", e);
        }
        Task task = new Task(jobInformation, taskInformation, tdd.getExecutionAttemptId(), tdd.getAllocationId(), tdd.getSubtaskIndex(), tdd.getAttemptNumber(), tdd.getProducedPartitions(), tdd.getInputGates(), memoryManager, taskExecutorServices.getIOManager(), taskExecutorServices.getShuffleEnvironment(), taskExecutorServices.getKvStateService(), taskExecutorServices.getBroadcastVariableManager(), taskExecutorServices.getTaskEventDispatcher(), externalResourceInfoProvider, taskStateManager, taskManagerActions, inputSplitProvider, checkpointResponder, taskOperatorEventGateway, aggregateManager, classLoaderHandle, fileCache, taskManagerConfiguration, taskMetricGroup, resultPartitionConsumableNotifier, partitionStateChecker, getRpcService().getScheduledExecutor());
        taskMetricGroup.gauge(MetricNames.IS_BACK_PRESSURED, task::isBackPressured);
        log.info("Received task {} ({}), deploy into slot with allocation id {}.", task.getTaskInfo().getTaskNameWithSubtasks(), tdd.getExecutionAttemptId(), tdd.getAllocationId());
        boolean taskAdded;
        try {
            taskAdded = taskSlotTable.addTask(task);
        } catch (SlotNotFoundException | SlotNotActiveException e) {
            throw new TaskSubmissionException("Could not submit task.", e);
        }
        if (taskAdded) {
            task.startTaskThread();
            setupResultPartitionBookkeeping(tdd.getJobId(), tdd.getProducedPartitions(), task.getTerminationFuture());
            return CompletableFuture.completedFuture(Acknowledge.get());
        } else {
            final String message = "TaskManager already contains a task for id " + task.getExecutionId() + '.';
            log.debug(message);
            throw new TaskSubmissionException(message);
        }
    } catch (TaskSubmissionException e) {
        return FutureUtils.completedExceptionally(e);
    }
}
Also used : SlotNotFoundException(org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException) TaskStateManagerImpl(org.apache.flink.runtime.state.TaskStateManagerImpl) Task(org.apache.flink.runtime.taskmanager.Task) SlotNotActiveException(org.apache.flink.runtime.taskexecutor.slot.SlotNotActiveException) RpcInputSplitProvider(org.apache.flink.runtime.taskexecutor.rpc.RpcInputSplitProvider) RpcTaskOperatorEventGateway(org.apache.flink.runtime.taskexecutor.rpc.RpcTaskOperatorEventGateway) TaskOperatorEventGateway(org.apache.flink.runtime.jobgraph.tasks.TaskOperatorEventGateway) JobManagerTaskRestore(org.apache.flink.runtime.checkpoint.JobManagerTaskRestore) RpcTaskOperatorEventGateway(org.apache.flink.runtime.taskexecutor.rpc.RpcTaskOperatorEventGateway) TaskManagerActions(org.apache.flink.runtime.taskmanager.TaskManagerActions) TaskSubmissionException(org.apache.flink.runtime.taskexecutor.exceptions.TaskSubmissionException) InputSplitProvider(org.apache.flink.runtime.jobgraph.tasks.InputSplitProvider) RpcInputSplitProvider(org.apache.flink.runtime.taskexecutor.rpc.RpcInputSplitProvider) ResultPartitionConsumableNotifier(org.apache.flink.runtime.io.network.partition.ResultPartitionConsumableNotifier) RpcResultPartitionConsumableNotifier(org.apache.flink.runtime.taskexecutor.rpc.RpcResultPartitionConsumableNotifier) JobInformation(org.apache.flink.runtime.executiongraph.JobInformation) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) TaskInformation(org.apache.flink.runtime.executiongraph.TaskInformation) TaskLocalStateStore(org.apache.flink.runtime.state.TaskLocalStateStore) TaskMetricGroup(org.apache.flink.runtime.metrics.groups.TaskMetricGroup) RpcCheckpointResponder(org.apache.flink.runtime.taskexecutor.rpc.RpcCheckpointResponder) CheckpointResponder(org.apache.flink.runtime.taskmanager.CheckpointResponder) TaskManagerJobMetricGroup(org.apache.flink.runtime.metrics.groups.TaskManagerJobMetricGroup) IOException(java.io.IOException) LibraryCacheManager(org.apache.flink.runtime.execution.librarycache.LibraryCacheManager) TaskStateManager(org.apache.flink.runtime.state.TaskStateManager) MemoryManager(org.apache.flink.runtime.memory.MemoryManager) RpcGlobalAggregateManager(org.apache.flink.runtime.taskexecutor.rpc.RpcGlobalAggregateManager) JobID(org.apache.flink.api.common.JobID)

Example 2 with TaskSubmissionException

use of org.apache.flink.runtime.taskexecutor.exceptions.TaskSubmissionException in project flink by apache.

the class TaskExecutor method submitTask.

// ======================================================================
//  RPC methods
// ======================================================================
// ----------------------------------------------------------------------
// Task lifecycle RPCs
// ----------------------------------------------------------------------
@RpcMethod
public Acknowledge submitTask(TaskDeploymentDescriptor tdd, UUID jobManagerLeaderId) throws TaskSubmissionException {
    // first, deserialize the pre-serialized information
    final JobInformation jobInformation;
    final TaskInformation taskInformation;
    try {
        jobInformation = tdd.getSerializedJobInformation().deserializeValue(getClass().getClassLoader());
        taskInformation = tdd.getSerializedTaskInformation().deserializeValue(getClass().getClassLoader());
    } catch (IOException | ClassNotFoundException e) {
        throw new TaskSubmissionException("Could not deserialize the job or task information.", e);
    }
    final JobID jobId = jobInformation.getJobId();
    final JobManagerConnection jobManagerConnection = jobManagerTable.get(jobId);
    if (jobManagerConnection == null) {
        final String message = "Could not submit task because there is no JobManager " + "associated for the job " + jobId + '.';
        log.debug(message);
        throw new TaskSubmissionException(message);
    }
    if (!jobManagerConnection.getLeaderId().equals(jobManagerLeaderId)) {
        final String message = "Rejecting the task submission because the job manager leader id " + jobManagerLeaderId + " does not match the expected job manager leader id " + jobManagerConnection.getLeaderId() + '.';
        log.debug(message);
        throw new TaskSubmissionException(message);
    }
    if (!taskSlotTable.existsActiveSlot(jobId, tdd.getAllocationId())) {
        final String message = "No task slot allocated for job ID " + jobId + " and allocation ID " + tdd.getAllocationId() + '.';
        log.debug(message);
        throw new TaskSubmissionException(message);
    }
    TaskMetricGroup taskMetricGroup = taskManagerMetricGroup.addTaskForJob(jobInformation.getJobId(), jobInformation.getJobName(), taskInformation.getJobVertexId(), tdd.getExecutionAttemptId(), taskInformation.getTaskName(), tdd.getSubtaskIndex(), tdd.getAttemptNumber());
    InputSplitProvider inputSplitProvider = new RpcInputSplitProvider(jobManagerConnection.getLeaderId(), jobManagerConnection.getJobManagerGateway(), jobInformation.getJobId(), taskInformation.getJobVertexId(), tdd.getExecutionAttemptId(), taskManagerConfiguration.getTimeout());
    TaskManagerActions taskManagerActions = jobManagerConnection.getTaskManagerActions();
    CheckpointResponder checkpointResponder = jobManagerConnection.getCheckpointResponder();
    LibraryCacheManager libraryCache = jobManagerConnection.getLibraryCacheManager();
    ResultPartitionConsumableNotifier resultPartitionConsumableNotifier = jobManagerConnection.getResultPartitionConsumableNotifier();
    PartitionProducerStateChecker partitionStateChecker = jobManagerConnection.getPartitionStateChecker();
    Task task = new Task(jobInformation, taskInformation, tdd.getExecutionAttemptId(), tdd.getAllocationId(), tdd.getSubtaskIndex(), tdd.getAttemptNumber(), tdd.getProducedPartitions(), tdd.getInputGates(), tdd.getTargetSlotNumber(), tdd.getTaskStateHandles(), memoryManager, ioManager, networkEnvironment, broadcastVariableManager, taskManagerActions, inputSplitProvider, checkpointResponder, libraryCache, fileCache, taskManagerConfiguration, taskMetricGroup, resultPartitionConsumableNotifier, partitionStateChecker, getRpcService().getExecutor());
    log.info("Received task {}.", task.getTaskInfo().getTaskNameWithSubtasks());
    boolean taskAdded;
    try {
        taskAdded = taskSlotTable.addTask(task);
    } catch (SlotNotFoundException | SlotNotActiveException e) {
        throw new TaskSubmissionException("Could not submit task.", e);
    }
    if (taskAdded) {
        task.startTaskThread();
        return Acknowledge.get();
    } else {
        final String message = "TaskManager already contains a task for id " + task.getExecutionId() + '.';
        log.debug(message);
        throw new TaskSubmissionException(message);
    }
}
Also used : SlotNotFoundException(org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException) JobInformation(org.apache.flink.runtime.executiongraph.JobInformation) Task(org.apache.flink.runtime.taskmanager.Task) TaskInformation(org.apache.flink.runtime.executiongraph.TaskInformation) TaskMetricGroup(org.apache.flink.runtime.metrics.groups.TaskMetricGroup) CheckpointResponder(org.apache.flink.runtime.taskmanager.CheckpointResponder) RpcCheckpointResponder(org.apache.flink.runtime.taskexecutor.rpc.RpcCheckpointResponder) SlotNotActiveException(org.apache.flink.runtime.taskexecutor.slot.SlotNotActiveException) RpcInputSplitProvider(org.apache.flink.runtime.taskexecutor.rpc.RpcInputSplitProvider) IOException(java.io.IOException) BlobLibraryCacheManager(org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager) LibraryCacheManager(org.apache.flink.runtime.execution.librarycache.LibraryCacheManager) TaskManagerActions(org.apache.flink.runtime.taskmanager.TaskManagerActions) TaskSubmissionException(org.apache.flink.runtime.taskexecutor.exceptions.TaskSubmissionException) PartitionProducerStateChecker(org.apache.flink.runtime.io.network.netty.PartitionProducerStateChecker) InputSplitProvider(org.apache.flink.runtime.jobgraph.tasks.InputSplitProvider) RpcInputSplitProvider(org.apache.flink.runtime.taskexecutor.rpc.RpcInputSplitProvider) ResultPartitionConsumableNotifier(org.apache.flink.runtime.io.network.partition.ResultPartitionConsumableNotifier) RpcResultPartitionConsumableNotifier(org.apache.flink.runtime.taskexecutor.rpc.RpcResultPartitionConsumableNotifier) JobID(org.apache.flink.api.common.JobID) RpcMethod(org.apache.flink.runtime.rpc.RpcMethod)

Aggregations

IOException (java.io.IOException)2 JobID (org.apache.flink.api.common.JobID)2 LibraryCacheManager (org.apache.flink.runtime.execution.librarycache.LibraryCacheManager)2 JobInformation (org.apache.flink.runtime.executiongraph.JobInformation)2 TaskInformation (org.apache.flink.runtime.executiongraph.TaskInformation)2 ResultPartitionConsumableNotifier (org.apache.flink.runtime.io.network.partition.ResultPartitionConsumableNotifier)2 InputSplitProvider (org.apache.flink.runtime.jobgraph.tasks.InputSplitProvider)2 TaskMetricGroup (org.apache.flink.runtime.metrics.groups.TaskMetricGroup)2 TaskSubmissionException (org.apache.flink.runtime.taskexecutor.exceptions.TaskSubmissionException)2 RpcCheckpointResponder (org.apache.flink.runtime.taskexecutor.rpc.RpcCheckpointResponder)2 RpcInputSplitProvider (org.apache.flink.runtime.taskexecutor.rpc.RpcInputSplitProvider)2 RpcResultPartitionConsumableNotifier (org.apache.flink.runtime.taskexecutor.rpc.RpcResultPartitionConsumableNotifier)2 SlotNotActiveException (org.apache.flink.runtime.taskexecutor.slot.SlotNotActiveException)2 SlotNotFoundException (org.apache.flink.runtime.taskexecutor.slot.SlotNotFoundException)2 CheckpointResponder (org.apache.flink.runtime.taskmanager.CheckpointResponder)2 Task (org.apache.flink.runtime.taskmanager.Task)2 TaskManagerActions (org.apache.flink.runtime.taskmanager.TaskManagerActions)2 JobManagerTaskRestore (org.apache.flink.runtime.checkpoint.JobManagerTaskRestore)1 BlobLibraryCacheManager (org.apache.flink.runtime.execution.librarycache.BlobLibraryCacheManager)1 ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)1