
Example 1 with SchedulerBase

Use of org.apache.flink.runtime.scheduler.SchedulerBase in the apache/flink project.

Taken from the class DefaultSchedulerCheckpointCoordinatorTest, method testClosingSchedulerSuspendsExecutionGraphAndShutsDownCheckpointCoordinator.

/**
 * Tests that the checkpoint coordinator is shut down if the execution graph is suspended.
 */
@Test
public void testClosingSchedulerSuspendsExecutionGraphAndShutsDownCheckpointCoordinator() throws Exception {
    final CompletableFuture<JobStatus> counterShutdownFuture = new CompletableFuture<>();
    CheckpointIDCounter counter = TestingCheckpointIDCounter.createStoreWithShutdownCheckAndNoStartAction(counterShutdownFuture);
    final CompletableFuture<JobStatus> storeShutdownFuture = new CompletableFuture<>();
    CompletedCheckpointStore store = TestingCompletedCheckpointStore.createStoreWithShutdownCheckAndNoCompletedCheckpoints(storeShutdownFuture);
    final SchedulerBase scheduler = createSchedulerAndEnableCheckpointing(counter, store);
    final ExecutionGraph graph = scheduler.getExecutionGraph();
    final CheckpointCoordinator checkpointCoordinator = graph.getCheckpointCoordinator();
    assertThat(checkpointCoordinator, Matchers.notNullValue());
    assertThat(checkpointCoordinator.isShutdown(), is(false));
    scheduler.closeAsync().get();
    assertThat(graph.getState(), is(JobStatus.SUSPENDED));
    assertThat(checkpointCoordinator.isShutdown(), is(true));
    assertThat(counterShutdownFuture.get(), is(JobStatus.SUSPENDED));
    assertThat(storeShutdownFuture.get(), is(JobStatus.SUSPENDED));
}
Also used: JobStatus (org.apache.flink.api.common.JobStatus), CompletableFuture (java.util.concurrent.CompletableFuture), ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph), SchedulerBase (org.apache.flink.runtime.scheduler.SchedulerBase), Test (org.junit.Test)
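
This test (and Example 2 below) ends with the same pair of assertions on the two shutdown futures. As a minimal sketch only, not part of the Flink sources, a helper like the following could factor that check out; the class name, method name, and the ten-second timeout are illustrative assumptions.

import java.util.concurrent.CompletableFuture;
import java.util.concurrent.TimeUnit;

import org.apache.flink.api.common.JobStatus;

import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.is;

final class CheckpointShutdownAssertions {

    private CheckpointShutdownAssertions() {
    }

    /**
     * Waits for the CheckpointIDCounter and CompletedCheckpointStore shutdown futures and
     * asserts that both observed the expected terminal JobStatus.
     */
    static void assertShutdownWithStatus(
            CompletableFuture<JobStatus> counterShutdownFuture,
            CompletableFuture<JobStatus> storeShutdownFuture,
            JobStatus expectedStatus) throws Exception {
        // Bounded waits keep a hanging shutdown from blocking the whole test run.
        assertThat(counterShutdownFuture.get(10, TimeUnit.SECONDS), is(expectedStatus));
        assertThat(storeShutdownFuture.get(10, TimeUnit.SECONDS), is(expectedStatus));
    }
}

With such a helper, the last two assertions of the test above would read CheckpointShutdownAssertions.assertShutdownWithStatus(counterShutdownFuture, storeShutdownFuture, JobStatus.SUSPENDED);.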

Example 2 with SchedulerBase

Use of org.apache.flink.runtime.scheduler.SchedulerBase in the apache/flink project.

Taken from the class DefaultSchedulerCheckpointCoordinatorTest, method testClosingSchedulerShutsDownCheckpointCoordinatorOnFailedExecutionGraph.

/**
 * Tests that the checkpoint coordinator is shut down if the execution graph fails.
 */
@Test
public void testClosingSchedulerShutsDownCheckpointCoordinatorOnFailedExecutionGraph() throws Exception {
    final CompletableFuture<JobStatus> counterShutdownFuture = new CompletableFuture<>();
    CheckpointIDCounter counter = TestingCheckpointIDCounter.createStoreWithShutdownCheckAndNoStartAction(counterShutdownFuture);
    final CompletableFuture<JobStatus> storeShutdownFuture = new CompletableFuture<>();
    CompletedCheckpointStore store = TestingCompletedCheckpointStore.createStoreWithShutdownCheckAndNoCompletedCheckpoints(storeShutdownFuture);
    final SchedulerBase scheduler = createSchedulerAndEnableCheckpointing(counter, store);
    final ExecutionGraph graph = scheduler.getExecutionGraph();
    final CheckpointCoordinator checkpointCoordinator = graph.getCheckpointCoordinator();
    assertThat(checkpointCoordinator, Matchers.notNullValue());
    assertThat(checkpointCoordinator.isShutdown(), is(false));
    graph.failJob(new Exception("Test Exception"), System.currentTimeMillis());
    scheduler.closeAsync().get();
    assertThat(checkpointCoordinator.isShutdown(), is(true));
    assertThat(counterShutdownFuture.get(), is(JobStatus.FAILED));
    assertThat(storeShutdownFuture.get(), is(JobStatus.FAILED));
}
Also used: JobStatus (org.apache.flink.api.common.JobStatus), CompletableFuture (java.util.concurrent.CompletableFuture), ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph), SchedulerBase (org.apache.flink.runtime.scheduler.SchedulerBase), Test (org.junit.Test)
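
Example 2 drives the graph to a terminal state directly via ExecutionGraph#failJob. For comparison only, here is a hedged sketch of the alternative route of reporting a task failure through the scheduler, using the same updateTaskExecutionState call that Example 3 below uses for RUNNING/FINISHED transitions. The helper name is illustrative, it assumes scheduling has started and a deployed execution attempt exists, and whether the job then ends up FAILED also depends on the configured restart strategy.

import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.executiongraph.ExecutionAttemptID;
import org.apache.flink.runtime.scheduler.SchedulerBase;
import org.apache.flink.runtime.taskmanager.TaskExecutionState;

final class TaskFailureDriver {

    private TaskFailureDriver() {
    }

    /**
     * Reports a FAILED state for the given execution attempt, letting the scheduler
     * (rather than the execution graph directly) react to the failure.
     */
    static void failTask(SchedulerBase scheduler, ExecutionAttemptID attemptId, Throwable cause) {
        scheduler.updateTaskExecutionState(new TaskExecutionState(attemptId, ExecutionState.FAILED, cause));
    }
}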

Example 3 with SchedulerBase

Use of org.apache.flink.runtime.scheduler.SchedulerBase in the apache/flink project.

Taken from the class DefaultExecutionGraphDeploymentTest, method testNoResourceAvailableFailure.

/**
 * Tests that a blocking batch job fails if there are not enough resources left to schedule the
 * succeeding tasks. This test case is related to [FLINK-4296] where finished producing tasks
 * swallow the fail exception when scheduling a consumer task.
 */
@Test
public void testNoResourceAvailableFailure() throws Exception {
    JobVertex v1 = new JobVertex("source");
    JobVertex v2 = new JobVertex("sink");
    int dop1 = 2;
    int dop2 = 2;
    v1.setParallelism(dop1);
    v2.setParallelism(dop2);
    v1.setInvokableClass(BatchTask.class);
    v2.setInvokableClass(BatchTask.class);
    v2.connectNewDataSetAsInput(v1, DistributionPattern.POINTWISE, ResultPartitionType.BLOCKING);
    final JobGraph graph = JobGraphTestUtils.batchJobGraph(v1, v2);
    DirectScheduledExecutorService directExecutor = new DirectScheduledExecutorService();
    // execution graph that executes actions synchronously
    final SchedulerBase scheduler =
            SchedulerTestingUtils.newSchedulerBuilder(
                            graph, ComponentMainThreadExecutorServiceAdapter.forMainThread())
                    .setExecutionSlotAllocatorFactory(
                            SchedulerTestingUtils.newSlotSharingExecutionSlotAllocatorFactory(
                                    TestingPhysicalSlotProvider.createWithLimitedAmountOfPhysicalSlots(1)))
                    .setFutureExecutor(directExecutor)
                    .setBlobWriter(blobWriter)
                    .build();
    final ExecutionGraph eg = scheduler.getExecutionGraph();
    checkJobOffloaded((DefaultExecutionGraph) eg);
    // schedule, this triggers mock deployment
    scheduler.startScheduling();
    ExecutionAttemptID attemptID = eg.getJobVertex(v1.getID()).getTaskVertices()[0].getCurrentExecutionAttempt().getAttemptId();
    scheduler.updateTaskExecutionState(new TaskExecutionState(attemptID, ExecutionState.RUNNING));
    scheduler.updateTaskExecutionState(new TaskExecutionState(attemptID, ExecutionState.FINISHED, null));
    assertEquals(JobStatus.FAILED, eg.getState());
}
Also used: JobGraph (org.apache.flink.runtime.jobgraph.JobGraph), JobVertex (org.apache.flink.runtime.jobgraph.JobVertex), DirectScheduledExecutorService (org.apache.flink.runtime.testutils.DirectScheduledExecutorService), SchedulerBase (org.apache.flink.runtime.scheduler.SchedulerBase), TaskExecutionState (org.apache.flink.runtime.taskmanager.TaskExecutionState), Test (org.junit.Test)
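
The builder chain in this test is the key piece of the setup: it caps the number of physical slots at one so the succeeding stage cannot be scheduled. As a hedged sketch (not part of the test class), the slot limit could be made a parameter. The helper name is illustrative, the blob writer and future executor from the original chain are omitted, and the import locations plus the checked-exception signature of build() are assumptions based on this excerpt.

import org.apache.flink.runtime.concurrent.ComponentMainThreadExecutorServiceAdapter;
import org.apache.flink.runtime.jobgraph.JobGraph;
import org.apache.flink.runtime.scheduler.SchedulerBase;
import org.apache.flink.runtime.scheduler.SchedulerTestingUtils;
import org.apache.flink.runtime.scheduler.TestingPhysicalSlotProvider;

final class LimitedSlotSchedulerFactory {

    private LimitedSlotSchedulerFactory() {
    }

    /**
     * Builds a scheduler for the given job graph whose slot provider hands out at most
     * numPhysicalSlots physical slots, mirroring the construction in the test above.
     */
    static SchedulerBase createSchedulerWithSlotLimit(JobGraph jobGraph, int numPhysicalSlots) throws Exception {
        return SchedulerTestingUtils.newSchedulerBuilder(
                        jobGraph, ComponentMainThreadExecutorServiceAdapter.forMainThread())
                .setExecutionSlotAllocatorFactory(
                        SchedulerTestingUtils.newSlotSharingExecutionSlotAllocatorFactory(
                                TestingPhysicalSlotProvider.createWithLimitedAmountOfPhysicalSlots(numPhysicalSlots)))
                .build();
    }
}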

Example 4 with SchedulerBase

Use of org.apache.flink.runtime.scheduler.SchedulerBase in the apache/flink project.

Taken from the class DefaultExecutionGraphDeploymentTest, method testRegistrationOfExecutionsCanceled.

@Test
public void testRegistrationOfExecutionsCanceled() throws Exception {
    final JobVertexID jid1 = new JobVertexID();
    final JobVertexID jid2 = new JobVertexID();
    JobVertex v1 = new JobVertex("v1", jid1);
    JobVertex v2 = new JobVertex("v2", jid2);
    SchedulerBase scheduler = setupScheduler(v1, 19, v2, 37);
    Collection<Execution> executions = new ArrayList<>(scheduler.getExecutionGraph().getRegisteredExecutions().values());
    for (Execution e : executions) {
        e.cancel();
        e.completeCancelling();
    }
    assertEquals(0, scheduler.getExecutionGraph().getRegisteredExecutions().size());
}
Also used: JobVertex (org.apache.flink.runtime.jobgraph.JobVertex), JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID), ArrayList (java.util.ArrayList), SchedulerBase (org.apache.flink.runtime.scheduler.SchedulerBase), Test (org.junit.Test)
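
The loop in the middle of this test is the part worth noting: the executions are copied into a fresh ArrayList before being cancelled, because completing a cancellation removes the execution from the graph's registry (which is exactly what the final assertion checks). A minimal sketch of that loop as a reusable helper follows; the class and method names are illustrative and not part of the Flink sources.

import java.util.ArrayList;
import java.util.Collection;

import org.apache.flink.runtime.executiongraph.Execution;
import org.apache.flink.runtime.scheduler.SchedulerBase;

final class ExecutionCancellationDriver {

    private ExecutionCancellationDriver() {
    }

    /**
     * Cancels every currently registered execution and immediately acknowledges the
     * cancellation, mirroring the loop in the test above.
     */
    static void cancelAllRegisteredExecutions(SchedulerBase scheduler) {
        // Copy first: completeCancelling() deregisters the execution, which would
        // otherwise modify the collection while we iterate over it.
        Collection<Execution> executions =
                new ArrayList<>(scheduler.getExecutionGraph().getRegisteredExecutions().values());
        for (Execution execution : executions) {
            execution.cancel();
            execution.completeCancelling();
        }
    }
}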

Example 5 with SchedulerBase

Use of org.apache.flink.runtime.scheduler.SchedulerBase in the apache/flink project.

Taken from the class ExecutionGraphPartitionReleaseTest, method testStrategyNotifiedOfUnFinishedVertices.

@Test
public void testStrategyNotifiedOfUnFinishedVertices() throws Exception {
    // setup a pipeline of 2 failover regions (f1 -> f2), where
    // f1 is just a source
    // f2 consists of 3 operators (o1,o2,o3), where o1 consumes f1, and o2/o3 consume o1
    final JobVertex sourceVertex = ExecutionGraphTestUtils.createNoOpVertex("source", 1);
    final JobVertex operator1Vertex = ExecutionGraphTestUtils.createNoOpVertex("operator1", 1);
    final JobVertex operator2Vertex = ExecutionGraphTestUtils.createNoOpVertex("operator2", 1);
    final JobVertex operator3Vertex = ExecutionGraphTestUtils.createNoOpVertex("operator3", 1);
    operator1Vertex.connectNewDataSetAsInput(sourceVertex, DistributionPattern.POINTWISE, ResultPartitionType.BLOCKING);
    operator2Vertex.connectNewDataSetAsInput(operator1Vertex, DistributionPattern.ALL_TO_ALL, ResultPartitionType.PIPELINED);
    operator3Vertex.connectNewDataSetAsInput(operator1Vertex, DistributionPattern.ALL_TO_ALL, ResultPartitionType.PIPELINED);
    // setup partition tracker to intercept partition release calls
    final TestingJobMasterPartitionTracker partitionTracker = new TestingJobMasterPartitionTracker();
    final Queue<ResultPartitionID> releasedPartitions = new ArrayDeque<>();
    partitionTracker.setStopTrackingAndReleasePartitionsConsumer(partitionIds -> releasedPartitions.add(partitionIds.iterator().next()));
    final SchedulerBase scheduler = createScheduler(partitionTracker, sourceVertex, operator1Vertex, operator2Vertex, operator3Vertex);
    final ExecutionGraph executionGraph = scheduler.getExecutionGraph();
    mainThreadExecutor.execute(() -> {
        final Execution sourceExecution = getCurrentExecution(sourceVertex, executionGraph);
        // finish the source; this should not result in any release calls since the
        // consumer o1 was not finished
        scheduler.updateTaskExecutionState(new TaskExecutionState(sourceExecution.getAttemptId(), ExecutionState.FINISHED));
        assertThat(releasedPartitions, empty());
    });
    mainThreadExecutor.execute(() -> {
        final Execution operator1Execution = getCurrentExecution(operator1Vertex, executionGraph);
        // finish o1; this should not result in any release calls since not all operators
        // of the pipelined region are finished
        for (final IntermediateResultPartitionID partitionId : operator1Execution.getVertex().getProducedPartitions().keySet()) {
            scheduler.notifyPartitionDataAvailable(new ResultPartitionID(partitionId, operator1Execution.getAttemptId()));
        }
        scheduler.updateTaskExecutionState(new TaskExecutionState(operator1Execution.getAttemptId(), ExecutionState.FINISHED));
        assertThat(releasedPartitions, empty());
    });
    mainThreadExecutor.execute(() -> {
        final Execution operator2Execution = getCurrentExecution(operator2Vertex, executionGraph);
        // finish o2; this should not result in any release calls since o3 was not
        // finished
        scheduler.updateTaskExecutionState(new TaskExecutionState(operator2Execution.getAttemptId(), ExecutionState.FINISHED));
        assertThat(releasedPartitions, empty());
    });
    mainThreadExecutor.execute(() -> {
        final Execution operator2Execution = getCurrentExecution(operator2Vertex, executionGraph);
        // reset o2 for a new execution attempt; the pipelined region is no longer fully finished
        operator2Execution.getVertex().resetForNewExecution();
        assertThat(releasedPartitions, empty());
    });
    mainThreadExecutor.execute(() -> {
        final Execution operator3Execution = getCurrentExecution(operator3Vertex, executionGraph);
        // finish o3; this should not result in any release calls since o2 was reset
        scheduler.updateTaskExecutionState(new TaskExecutionState(operator3Execution.getAttemptId(), ExecutionState.FINISHED));
        assertThat(releasedPartitions, empty());
    });
}
Also used: JobVertex (org.apache.flink.runtime.jobgraph.JobVertex), TestingJobMasterPartitionTracker (org.apache.flink.runtime.io.network.partition.TestingJobMasterPartitionTracker), SchedulerBase (org.apache.flink.runtime.scheduler.SchedulerBase), IntermediateResultPartitionID (org.apache.flink.runtime.jobgraph.IntermediateResultPartitionID), ResultPartitionID (org.apache.flink.runtime.io.network.partition.ResultPartitionID), ArrayDeque (java.util.ArrayDeque), TaskExecutionState (org.apache.flink.runtime.taskmanager.TaskExecutionState), Test (org.junit.Test)
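
Each step of this test repeats the same pattern: mark one execution as FINISHED through the scheduler, then assert that the intercepted partition tracker still has not released anything. A hedged sketch of that pattern as a helper follows; the names are illustrative, and like the original steps it would have to run on the main-thread executor.

import java.util.Queue;

import org.apache.flink.runtime.execution.ExecutionState;
import org.apache.flink.runtime.executiongraph.Execution;
import org.apache.flink.runtime.io.network.partition.ResultPartitionID;
import org.apache.flink.runtime.scheduler.SchedulerBase;
import org.apache.flink.runtime.taskmanager.TaskExecutionState;

import static org.hamcrest.MatcherAssert.assertThat;
import static org.hamcrest.Matchers.empty;

final class PartitionReleaseAssertions {

    private PartitionReleaseAssertions() {
    }

    /**
     * Marks the given execution as FINISHED via the scheduler and then checks that the
     * partition tracker has still not released any partition.
     */
    static void finishAndExpectNoRelease(
            SchedulerBase scheduler, Execution execution, Queue<ResultPartitionID> releasedPartitions) {
        scheduler.updateTaskExecutionState(
                new TaskExecutionState(execution.getAttemptId(), ExecutionState.FINISHED));
        assertThat(releasedPartitions, empty());
    }
}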

Aggregations

SchedulerBase (org.apache.flink.runtime.scheduler.SchedulerBase): 56
Test (org.junit.Test): 49
JobVertex (org.apache.flink.runtime.jobgraph.JobVertex): 33
JobGraph (org.apache.flink.runtime.jobgraph.JobGraph): 19
JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID): 10
CompletableFuture (java.util.concurrent.CompletableFuture): 8
IOException (java.io.IOException): 7
TestingPhysicalSlotProvider (org.apache.flink.runtime.scheduler.TestingPhysicalSlotProvider): 7
TestRestartBackoffTimeStrategy (org.apache.flink.runtime.executiongraph.failover.flip1.TestRestartBackoffTimeStrategy): 6
TaskExecutionState (org.apache.flink.runtime.taskmanager.TaskExecutionState): 6
ArrayList (java.util.ArrayList): 5
JobStatus (org.apache.flink.api.common.JobStatus): 5
IntermediateResultPartitionID (org.apache.flink.runtime.jobgraph.IntermediateResultPartitionID): 5
ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph): 4
ResultPartitionID (org.apache.flink.runtime.io.network.partition.ResultPartitionID): 4
SlotPool (org.apache.flink.runtime.jobmaster.slotpool.SlotPool): 4
TestingPhysicalSlot (org.apache.flink.runtime.scheduler.TestingPhysicalSlot): 4
VertexParallelismInformation (org.apache.flink.runtime.scheduler.VertexParallelismInformation): 4
VertexParallelismStore (org.apache.flink.runtime.scheduler.VertexParallelismStore): 4
TaskManagerLocation (org.apache.flink.runtime.taskmanager.TaskManagerLocation): 4