Search in sources:

Example 6 with ComponentMainThreadExecutor

use of org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor in project flink by apache.

The class DefaultSchedulerTest, method testProducedPartitionRegistrationTimeout.

/**
 * Starts scheduling a non-parallel source/sink job while the shuffle master does not
 * auto-complete registrations and the timeout is set to 1 ms, then blocks until the test
 * vertex operations report 2 canceled vertices and 1 failed vertex.
 *
 * <p>The main thread executor is backed by a dedicated single-threaded scheduled executor that
 * is shut down when the test finishes, regardless of the outcome.
 */
@Test
public void testProducedPartitionRegistrationTimeout() throws Exception {
    final ScheduledExecutorService singleThreadExecutor = Executors.newSingleThreadScheduledExecutor();
    try {
        final ComponentMainThreadExecutor mainThreadExecutor = ComponentMainThreadExecutorServiceAdapter.forSingleThreadExecutor(singleThreadExecutor);
        // presumably keeps partition registrations pending so that only the timeout resolves them
        shuffleMaster.setAutoCompleteRegistration(false);
        final JobGraph sourceSinkJobGraph = nonParallelSourceSinkJobGraph();
        timeout = Time.milliseconds(1);
        createSchedulerAndStartScheduling(sourceSinkJobGraph, mainThreadExecutor);
        testExecutionVertexOperations.awaitCanceledVertices(2);
        testExecutionVertexOperations.awaitFailedVertices(1);
    } finally {
        singleThreadExecutor.shutdown();
    }
}
Also used : DirectScheduledExecutorService(org.apache.flink.runtime.testutils.DirectScheduledExecutorService) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) ComponentMainThreadExecutor(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor) AdaptiveSchedulerTest(org.apache.flink.runtime.scheduler.adaptive.AdaptiveSchedulerTest) Test(org.junit.Test)

Example 7 with ComponentMainThreadExecutor

use of org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor in project flink by apache.

The class Execution, method sendCancelRpcCall.

/**
 * Sends a CancelTask message to the TaskManager gateway of the currently assigned slot.
 *
 * <p>Nothing happens when no slot is assigned. Otherwise the cancel RPC is issued through
 * {@code FutureUtils.retry} on the job master main thread executor; if the resulting future
 * completes with a failure, this execution is failed with a wrapping exception.
 *
 * @param numberRetries retry count handed to {@code FutureUtils.retry}
 */
private void sendCancelRpcCall(int numberRetries) {
    final LogicalSlot assignedSlot = assignedResource;
    if (assignedSlot == null) {
        // no slot, hence no TaskManager to notify
        return;
    }

    final TaskManagerGateway gateway = assignedSlot.getTaskManagerGateway();
    final ComponentMainThreadExecutor mainThreadExecutor = getVertex().getExecutionGraphAccessor().getJobMasterMainThreadExecutor();

    FutureUtils.retry(() -> gateway.cancelTask(attemptId, rpcTimeout), numberRetries, mainThreadExecutor)
            .whenComplete((ignoredAck, failure) -> {
                if (failure != null) {
                    fail(new Exception("Task could not be canceled.", failure));
                }
            });
}
Also used : ComponentMainThreadExecutor(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) TaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway) LogicalSlot(org.apache.flink.runtime.jobmaster.LogicalSlot) TimeoutException(java.util.concurrent.TimeoutException) TaskNotRunningException(org.apache.flink.runtime.operators.coordination.TaskNotRunningException) FlinkException(org.apache.flink.util.FlinkException) JobException(org.apache.flink.runtime.JobException)

Example 8 with ComponentMainThreadExecutor

use of org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor in project flink by apache.

The class Execution, method deploy.

/**
 * Deploys the execution to the previously assigned resource.
 *
 * <p>The method first validates its preconditions (a slot is assigned and alive, the execution
 * is in state SCHEDULED and is the payload of the slot), moves the state to DEPLOYING, builds
 * the task deployment descriptor and finally submits it to the TaskManager gateway
 * asynchronously. The submission result is handled on the job master main thread executor.
 *
 * <p>Any {@link Throwable} raised after the precondition checks is not propagated but passed to
 * {@code markFailed}.
 *
 * @throws JobException if the execution cannot be deployed to the assigned resource
 * @throws IllegalStateException if the execution is not in state SCHEDULED, if the transition
 *     to DEPLOYING fails, or if this execution is not the payload of the assigned slot
 */
public void deploy() throws JobException {
    assertRunningInJobMasterMainThread();
    // work on a local snapshot of the assigned slot
    final LogicalSlot slot = assignedResource;
    checkNotNull(slot, "In order to deploy the execution we first have to assign a resource via tryAssignResource.");
    // The more general check is the rpcTimeout of the deployment call
    if (!slot.isAlive()) {
        throw new JobException("Target slot (TaskManager) for deployment is no longer alive.");
    }
    // make sure exactly one deployment call happens from the correct state
    ExecutionState previous = this.state;
    if (previous == SCHEDULED) {
        if (!transitionState(previous, DEPLOYING)) {
            // this should actually not happen and indicates a race somewhere else
            throw new IllegalStateException("Cannot deploy task: Concurrent deployment call race.");
        }
    } else {
        // vertex may have been cancelled, or it was already scheduled
        throw new IllegalStateException("The vertex must be in SCHEDULED state to be deployed. Found state " + previous);
    }
    // the slot must carry this very execution as its payload
    if (this != slot.getPayload()) {
        throw new IllegalStateException(String.format("The execution %s has not been assigned to the assigned slot.", this));
    }
    try {
        // race double check, did we fail/cancel and do we need to release the slot?
        if (this.state != DEPLOYING) {
            slot.releaseSlot(new FlinkException("Actual state of execution " + this + " (" + state + ") does not match expected state DEPLOYING."));
            return;
        }
        LOG.info("Deploying {} (attempt #{}) with attempt id {} and vertex id {} to {} with allocation id {}", vertex.getTaskNameWithSubtaskIndex(), attemptNumber, vertex.getCurrentExecutionAttempt().getAttemptId(), vertex.getID(), getAssignedResourceLocation(), slot.getAllocationId());
        final TaskDeploymentDescriptor deployment = TaskDeploymentDescriptorFactory.fromExecutionVertex(vertex, attemptNumber).createDeploymentDescriptor(slot.getAllocationId(), taskRestore, producedPartitions.values());
        // null taskRestore to let it be GC'ed
        taskRestore = null;
        final TaskManagerGateway taskManagerGateway = slot.getTaskManagerGateway();
        final ComponentMainThreadExecutor jobMasterMainThreadExecutor = vertex.getExecutionGraphAccessor().getJobMasterMainThreadExecutor();
        // announce the pending deployment before the submission is handed off
        getVertex().notifyPendingDeployment(this);
        // We run the submission in the future executor so that the serialization of large TDDs
        // does not block the main thread, and sync back to the main thread once the submission
        // is completed. thenCompose(Function.identity()) flattens the nested future returned by
        // submitTask.
        CompletableFuture.supplyAsync(() -> taskManagerGateway.submitTask(deployment, rpcTimeout), executor).thenCompose(Function.identity()).whenCompleteAsync((ack, failure) -> {
            if (failure == null) {
                vertex.notifyCompletedDeployment(this);
            } else {
                // unwrap the CompletionException so the real cause can be inspected
                final Throwable actualFailure = ExceptionUtils.stripCompletionException(failure);
                if (actualFailure instanceof TimeoutException) {
                    // timeouts get a more descriptive message that names the task and the TaskManager
                    String taskname = vertex.getTaskNameWithSubtaskIndex() + " (" + attemptId + ')';
                    markFailed(new Exception("Cannot deploy task " + taskname + " - TaskManager (" + getAssignedResourceLocation() + ") not responding after a rpcTimeout of " + rpcTimeout, actualFailure));
                } else {
                    markFailed(actualFailure);
                }
            }
        }, jobMasterMainThreadExecutor);
    } catch (Throwable t) {
        // any failure while preparing or submitting the deployment fails the execution
        markFailed(t);
    }
}
Also used : JobException(org.apache.flink.runtime.JobException) ExecutionState(org.apache.flink.runtime.execution.ExecutionState) ComponentMainThreadExecutor(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor) TaskManagerGateway(org.apache.flink.runtime.jobmanager.slots.TaskManagerGateway) TaskDeploymentDescriptor(org.apache.flink.runtime.deployment.TaskDeploymentDescriptor) LogicalSlot(org.apache.flink.runtime.jobmaster.LogicalSlot) FlinkException(org.apache.flink.util.FlinkException) TimeoutException(java.util.concurrent.TimeoutException) TaskNotRunningException(org.apache.flink.runtime.operators.coordination.TaskNotRunningException) FlinkException(org.apache.flink.util.FlinkException) JobException(org.apache.flink.runtime.JobException) TimeoutException(java.util.concurrent.TimeoutException)

Example 9 with ComponentMainThreadExecutor

use of org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor in project flink by apache.

The class OperatorCoordinatorSchedulerTest, method setupTestJobAndScheduler.

/**
 * Builds a streaming job graph with a single vertex (parallelism 2) that carries the given
 * operator coordinator provider, enables checkpointing on it and creates a
 * {@code DefaultScheduler} for the job. The created scheduler is also stored in
 * {@code createdScheduler}.
 *
 * @param provider coordinator provider attached to the vertex
 * @param taskExecutorOperatorEventGateway optional gateway passed to the scheduler builder;
 *     when {@code null} the builder is created without one
 * @param jobGraphPreProcessing optional hook applied to the job graph before the scheduler is
 *     built
 * @param restartAllOnFailover whether to install the {@code RestartAllFailoverStrategy}
 * @return the newly built scheduler
 */
private DefaultScheduler setupTestJobAndScheduler(OperatorCoordinator.Provider provider, @Nullable TaskExecutorOperatorEventGateway taskExecutorOperatorEventGateway, @Nullable Consumer<JobGraph> jobGraphPreProcessing, boolean restartAllOnFailover) throws Exception {
    final JobVertex coordinatedVertex = new JobVertex(
            "Vertex with OperatorCoordinator",
            testVertexId,
            Collections.singletonList(OperatorIDPair.of(new OperatorID(), provider.getOperatorId())));
    coordinatedVertex.setInvokableClass(NoOpInvokable.class);
    coordinatedVertex.addOperatorCoordinator(new SerializedValue<>(provider));
    coordinatedVertex.setParallelism(2);

    final JobGraph graph = JobGraphBuilder.newStreamingJobGraphBuilder().addJobVertex(coordinatedVertex).build();
    SchedulerTestingUtils.enableCheckpointing(graph);
    if (jobGraphPreProcessing != null) {
        jobGraphPreProcessing.accept(graph);
    }

    // the calling thread acts as the "main thread" of the scheduler
    final ComponentMainThreadExecutor mainThread = new ComponentMainThreadExecutorServiceAdapter((ScheduledExecutorService) executor, Thread.currentThread());

    final SchedulerTestingUtils.DefaultSchedulerBuilder builder;
    if (taskExecutorOperatorEventGateway != null) {
        builder = SchedulerTestingUtils.createSchedulerBuilder(graph, mainThread, taskExecutorOperatorEventGateway);
    } else {
        builder = SchedulerTestingUtils.createSchedulerBuilder(graph, mainThread);
    }
    if (restartAllOnFailover) {
        builder.setFailoverStrategyFactory(new RestartAllFailoverStrategy.Factory());
    }

    final DefaultScheduler built = builder.setFutureExecutor(executor).setDelayExecutor(executor).build();
    this.createdScheduler = built;
    return built;
}
Also used : JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) ComponentMainThreadExecutor(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor) RestartAllFailoverStrategy(org.apache.flink.runtime.executiongraph.failover.flip1.RestartAllFailoverStrategy) ComponentMainThreadExecutorServiceAdapter(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutorServiceAdapter) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) SchedulerTestingUtils(org.apache.flink.runtime.scheduler.SchedulerTestingUtils) DefaultScheduler(org.apache.flink.runtime.scheduler.DefaultScheduler) OperatorIDPair(org.apache.flink.runtime.OperatorIDPair)

Example 10 with ComponentMainThreadExecutor

use of org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor in project flink by apache.

The class OperatorCoordinatorHolderTest, method checkpointEventValueAtomicity.

/**
 * Creates a coordinator holder from the given constructor function, triggers a checkpoint on
 * it and verifies that the value stored in the checkpoint equals the number of events sent, and
 * that the sent events carry the consecutive values {@code 0 .. checkpointedValue - 1}.
 *
 * @param coordinatorCtor factory that creates the coordinator under test from its context
 */
private void checkpointEventValueAtomicity(final Function<OperatorCoordinator.Context, OperatorCoordinator> coordinatorCtor) throws Exception {
    final ManuallyTriggeredScheduledExecutorService manualExecutor = new ManuallyTriggeredScheduledExecutorService();
    final ComponentMainThreadExecutor mainThread = new ComponentMainThreadExecutorServiceAdapter((ScheduledExecutorService) manualExecutor, Thread.currentThread());
    final EventReceivingTasks tasks = EventReceivingTasks.createForRunningTasks();
    final OperatorCoordinatorHolder coordinatorHolder = createCoordinatorHolder(tasks, coordinatorCtor, mainThread);

    // Let the coordinator emit a few events first. Not strictly required, but the random
    // pause varies the interleaving of the coordinator's thread (event sender) and the main
    // thread (holder), so a missed corner case should show up as a flaky test.
    Thread.sleep(new Random().nextInt(10));
    manualExecutor.triggerAll();

    // Trigger the checkpoint; this should also shut the valve as soon as the future is
    // completed.
    final CompletableFuture<byte[]> checkpointResult = new CompletableFuture<>();
    coordinatorHolder.checkpointCoordinator(0L, checkpointResult);
    manualExecutor.triggerAll();

    // Second random pause, for the same reason as the first one.
    Thread.sleep(new Random().nextInt(10));
    coordinatorHolder.close();
    manualExecutor.triggerAll();

    assertTrue(checkpointResult.isDone());
    final int checkpointedValue = bytesToInt(checkpointResult.get());
    assertEquals(checkpointedValue, tasks.getNumberOfSentEvents());
    for (int eventIndex = 0; eventIndex < checkpointedValue; eventIndex++) {
        assertEquals(eventIndex, ((TestOperatorEvent) tasks.getAllSentEvents().get(eventIndex).event).getValue());
    }
}
Also used : ManuallyTriggeredScheduledExecutorService(org.apache.flink.runtime.concurrent.ManuallyTriggeredScheduledExecutorService) CompletableFuture(java.util.concurrent.CompletableFuture) ComponentMainThreadExecutor(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor) Random(java.util.Random) ComponentMainThreadExecutorServiceAdapter(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutorServiceAdapter)

Aggregations

ComponentMainThreadExecutor (org.apache.flink.runtime.concurrent.ComponentMainThreadExecutor)11 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)5 LogicalSlot (org.apache.flink.runtime.jobmaster.LogicalSlot)5 CompletableFuture (java.util.concurrent.CompletableFuture)4 FlinkException (org.apache.flink.util.FlinkException)4 Test (org.junit.Test)4 ScheduledExecutorService (java.util.concurrent.ScheduledExecutorService)3 ComponentMainThreadExecutorServiceAdapter (org.apache.flink.runtime.concurrent.ComponentMainThreadExecutorServiceAdapter)3 ExecutionState (org.apache.flink.runtime.execution.ExecutionState)3 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)3 Duration (java.time.Duration)2 ArrayList (java.util.ArrayList)2 Collections (java.util.Collections)2 Iterator (java.util.Iterator)2 List (java.util.List)2 ScheduledFuture (java.util.concurrent.ScheduledFuture)2 TimeUnit (java.util.concurrent.TimeUnit)2 TimeoutException (java.util.concurrent.TimeoutException)2 JobID (org.apache.flink.api.common.JobID)2 JobStatus (org.apache.flink.api.common.JobStatus)2