Search in sources :

Example 21 with TaskManagerGateway

use of org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway in project flink by apache.

the class Execution method sendReleaseIntermediateResultPartitionsRpcCall.

/**
 * Releases the pipelined result partitions produced by this execution attempt.
 *
 * <p>When no slot is assigned, nothing happens beyond the log statement. Otherwise the shuffle
 * descriptor of every pipelined partition is handed to the shuffle master for external release,
 * and the collected partition ids are sent to the TaskManager gateway of the assigned slot
 * (skipped when the set of ids is empty).
 */
private void sendReleaseIntermediateResultPartitionsRpcCall() {
    LOG.info("Discarding the results produced by task execution {}.", attemptId);

    final LogicalSlot currentSlot = assignedResource;
    if (currentSlot == null) {
        return;
    }

    final TaskManagerGateway gateway = currentSlot.getTaskManagerGateway();
    final ShuffleMaster<?> master = getVertex().getExecutionGraphAccessor().getShuffleMaster();

    final Set<ResultPartitionID> pipelinedPartitionIds =
            producedPartitions.values().stream()
                    .filter(descriptor -> descriptor.getPartitionType().isPipelined())
                    .map(ResultPartitionDeploymentDescriptor::getShuffleDescriptor)
                    // not a debugging aid: this peek performs the release on the shuffle master
                    .peek(master::releasePartitionExternally)
                    .map(ShuffleDescriptor::getResultPartitionID)
                    .collect(Collectors.toSet());

    if (pipelinedPartitionIds.isEmpty()) {
        return;
    }

    // TODO For some tests this could be a problem when querying too early if all
    // resources were released
    gateway.releasePartitions(getVertex().getJobId(), pipelinedPartitionIds);
}
Also used : TaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway) ResultPartitionID(org.apache.flink.runtime.io.network.partition.ResultPartitionID) IntermediateResultPartitionID(org.apache.flink.runtime.jobgraph.IntermediateResultPartitionID) LogicalSlot(org.apache.flink.runtime.jobmaster.LogicalSlot)

Example 22 with TaskManagerGateway

use of org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway in project flink by apache.

the class Execution method notifyCheckpointAborted.

/**
 * Informs the task behind this execution that a checkpoint has been aborted.
 *
 * <p>The notification is forwarded to the TaskManager gateway of the currently assigned slot.
 * If no slot is assigned, only a debug message is logged.
 *
 * @param abortCheckpointId id of the aborted checkpoint
 * @param latestCompletedCheckpointId id of the latest completed checkpoint
 * @param timestamp timestamp forwarded unchanged with the notification
 */
public void notifyCheckpointAborted(long abortCheckpointId, long latestCompletedCheckpointId, long timestamp) {
    final LogicalSlot currentSlot = assignedResource;
    if (currentSlot == null) {
        LOG.debug("The execution has no slot assigned. This indicates that the execution is " + "no longer running.");
        return;
    }

    final TaskManagerGateway gateway = currentSlot.getTaskManagerGateway();
    gateway.notifyCheckpointAborted(
            attemptId,
            getVertex().getJobId(),
            abortCheckpointId,
            latestCompletedCheckpointId,
            timestamp);
}
Also used : TaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway) LogicalSlot(org.apache.flink.runtime.jobmaster.LogicalSlot)

Example 23 with TaskManagerGateway

use of org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway in project flink by apache.

the class Execution method sendCancelRpcCall.

/**
 * Sends a cancel request for this execution attempt to the TaskManager gateway of the assigned
 * slot.
 *
 * <p>The call is issued through {@code FutureUtils.retry} on the job master main thread
 * executor. If the resulting future completes with a failure, the execution is failed with an
 * exception wrapping that failure. Nothing is sent when no slot is assigned.
 *
 * @param numberRetries retry count handed to {@code FutureUtils.retry}
 */
private void sendCancelRpcCall(int numberRetries) {
    final LogicalSlot currentSlot = assignedResource;
    if (currentSlot == null) {
        return;
    }

    final TaskManagerGateway gateway = currentSlot.getTaskManagerGateway();
    final ComponentMainThreadExecutor mainThreadExecutor =
            getVertex().getExecutionGraphAccessor().getJobMasterMainThreadExecutor();

    final CompletableFuture<Acknowledge> cancelFuture =
            FutureUtils.retry(
                    () -> gateway.cancelTask(attemptId, rpcTimeout),
                    numberRetries,
                    mainThreadExecutor);

    cancelFuture.whenComplete(
            (ignoredAck, cancelFailure) -> {
                if (cancelFailure != null) {
                    fail(new Exception("Task could not be canceled.", cancelFailure));
                }
            });
}
Also used : ComponentMainThreadExecutor(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) TaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway) LogicalSlot(org.apache.flink.runtime.jobmaster.LogicalSlot) TimeoutException(java.util.concurrent.TimeoutException) TaskNotRunningException(org.apache.flink.runtime.operators.coordination.TaskNotRunningException) FlinkException(org.apache.flink.util.FlinkException) JobException(org.apache.flink.runtime.JobException)

Example 24 with TaskManagerGateway

use of org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway in project flink by apache.

the class Execution method deploy.

/**
 * Deploys the execution to the previously assigned resource.
 *
 * <p>The method moves the execution from SCHEDULED to DEPLOYING, builds a {@link
 * TaskDeploymentDescriptor} for the assigned slot and submits it asynchronously to the slot's
 * TaskManager gateway. Any throwable raised after the up-front state checks is handed to {@code
 * markFailed} instead of being rethrown; submission failures reported by the future are handled
 * the same way.
 *
 * @throws JobException if the execution cannot be deployed to the assigned resource, i.e. the
 *     assigned slot is no longer alive
 * @throws IllegalStateException if the execution is not in SCHEDULED state, the transition to
 *     DEPLOYING fails, or this execution is not the payload of the assigned slot
 */
public void deploy() throws JobException {
    assertRunningInJobMasterMainThread();
    // read the field once so that all checks below operate on the same slot reference
    final LogicalSlot slot = assignedResource;
    // NOTE(review): presumably throws when no slot is assigned — confirm against checkNotNull
    checkNotNull(slot, "In order to deploy the execution we first have to assign a resource via tryAssignResource.");
    // The more general check is the rpcTimeout of the deployment call
    if (!slot.isAlive()) {
        throw new JobException("Target slot (TaskManager) for deployment is no longer alive.");
    }
    // make sure exactly one deployment call happens from the correct state
    ExecutionState previous = this.state;
    if (previous == SCHEDULED) {
        if (!transitionState(previous, DEPLOYING)) {
            // this should actually not happen and indicates a race somewhere else
            throw new IllegalStateException("Cannot deploy task: Concurrent deployment call race.");
        }
    } else {
        // vertex may have been cancelled, or it was already scheduled
        throw new IllegalStateException("The vertex must be in SCHEDULED state to be deployed. Found state " + previous);
    }
    // the slot must carry this very execution as its payload
    if (this != slot.getPayload()) {
        throw new IllegalStateException(String.format("The execution %s has not been assigned to the assigned slot.", this));
    }
    // from here on, failures are reported through markFailed (see catch block) instead of thrown
    try {
        // race double check, did we fail/cancel and do we need to release the slot?
        if (this.state != DEPLOYING) {
            slot.releaseSlot(new FlinkException("Actual state of execution " + this + " (" + state + ") does not match expected state DEPLOYING."));
            return;
        }
        LOG.info("Deploying {} (attempt #{}) with attempt id {} and vertex id {} to {} with allocation id {}", vertex.getTaskNameWithSubtaskIndex(), attemptNumber, vertex.getCurrentExecutionAttempt().getAttemptId(), vertex.getID(), getAssignedResourceLocation(), slot.getAllocationId());
        // the descriptor is built from the vertex, the slot's allocation id, the restore state
        // and the produced partitions
        final TaskDeploymentDescriptor deployment = TaskDeploymentDescriptorFactory.fromExecutionVertex(vertex, attemptNumber).createDeploymentDescriptor(slot.getAllocationId(), taskRestore, producedPartitions.values());
        // null taskRestore to let it be GC'ed
        taskRestore = null;
        final TaskManagerGateway taskManagerGateway = slot.getTaskManagerGateway();
        final ComponentMainThreadExecutor jobMasterMainThreadExecutor = vertex.getExecutionGraphAccessor().getJobMasterMainThreadExecutor();
        getVertex().notifyPendingDeployment(this);
        // We run the submission in the future executor so that the serialization of large TDDs
        // does not block the main thread and sync back to the main thread once submission is
        // completed.
        // supplyAsync yields a future of the future returned by submitTask;
        // thenCompose(Function.identity()) flattens it. The completion callback is scheduled on
        // jobMasterMainThreadExecutor.
        CompletableFuture.supplyAsync(() -> taskManagerGateway.submitTask(deployment, rpcTimeout), executor).thenCompose(Function.identity()).whenCompleteAsync((ack, failure) -> {
            if (failure == null) {
                vertex.notifyCompletedDeployment(this);
            } else {
                // unwrap the CompletionException layer before inspecting the failure type
                final Throwable actualFailure = ExceptionUtils.stripCompletionException(failure);
                if (actualFailure instanceof TimeoutException) {
                    // timeouts get a descriptive message naming the task and the TaskManager
                    String taskname = vertex.getTaskNameWithSubtaskIndex() + " (" + attemptId + ')';
                    markFailed(new Exception("Cannot deploy task " + taskname + " - TaskManager (" + getAssignedResourceLocation() + ") not responding after a rpcTimeout of " + rpcTimeout, actualFailure));
                } else {
                    markFailed(actualFailure);
                }
            }
        }, jobMasterMainThreadExecutor);
    } catch (Throwable t) {
        // anything thrown while preparing or triggering the submission fails the execution
        markFailed(t);
    }
}
Also used : JobException(org.apache.flink.runtime.JobException) ExecutionState(org.apache.flink.runtime.execution.ExecutionState) ComponentMainThreadExecutor(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor) TaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) LogicalSlot(org.apache.flink.runtime.jobmaster.LogicalSlot) FlinkException(org.apache.flink.util.FlinkException) TimeoutException(java.util.concurrent.TimeoutException) TaskNotRunningException(org.apache.flink.runtime.operators.coordination.TaskNotRunningException)

Example 25 with TaskManagerGateway

use of org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway in project flink by apache.

the class Execution method notifyCheckpointOnComplete.

/**
 * Informs the task behind this execution that a checkpoint has completed, passing along the id
 * of the last subsumed checkpoint if there is one.
 *
 * <p>The notification is forwarded to the TaskManager gateway of the currently assigned slot.
 * If no slot is assigned, only a debug message is logged.
 *
 * @param completedCheckpointId id of the completed checkpoint
 * @param completedTimestamp timestamp of the completed checkpoint
 * @param lastSubsumedCheckpointId id of the last subsumed checkpoint; a value of {@link
 *     org.apache.flink.runtime.checkpoint.CheckpointStoreUtil#INVALID_CHECKPOINT_ID} means no
 *     checkpoint has been subsumed
 */
public void notifyCheckpointOnComplete(long completedCheckpointId, long completedTimestamp, long lastSubsumedCheckpointId) {
    final LogicalSlot currentSlot = assignedResource;
    if (currentSlot == null) {
        LOG.debug("The execution has no slot assigned. This indicates that the execution is " + "no longer running.");
        return;
    }

    final TaskManagerGateway gateway = currentSlot.getTaskManagerGateway();
    gateway.notifyCheckpointOnComplete(
            attemptId,
            getVertex().getJobId(),
            completedCheckpointId,
            completedTimestamp,
            lastSubsumedCheckpointId);
}
Also used : TaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway) LogicalSlot(org.apache.flink.runtime.jobmaster.LogicalSlot)

Aggregations

TaskManagerGateway (org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway)26 SimpleSlot (org.apache.flink.runtime.instance.SimpleSlot)12 FlinkCompletableFuture (org.apache.flink.runtime.concurrent.impl.FlinkCompletableFuture)10 Test (org.junit.Test)10 JobID (org.apache.flink.api.common.JobID)9 TaskDeploymentDescriptor (org.apache.flink.runtime.deployment.TaskDeploymentDescriptor)8 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)8 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)8 Time (org.apache.flink.api.common.time.Time)7 AllocatedSlot (org.apache.flink.runtime.jobmanager.slots.AllocatedSlot)7 JobException (org.apache.flink.runtime.JobException)6 LogicalSlot (org.apache.flink.runtime.jobmaster.LogicalSlot)6 TimeoutException (java.util.concurrent.TimeoutException)5 Slot (org.apache.flink.runtime.instance.Slot)5 Acknowledge (org.apache.flink.runtime.messages.Acknowledge)5 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)4 Future (org.apache.flink.runtime.concurrent.Future)4 SlotOwner (org.apache.flink.runtime.jobmanager.slots.SlotOwner)4 TaskManagerLocation (org.apache.flink.runtime.taskmanager.TaskManagerLocation)4 ArrayList (java.util.ArrayList)3