Search in sources :

Example 36 with ExecutionGraph

use of org.apache.flink.runtime.executiongraph.ExecutionGraph in project flink by apache.

the class CheckpointCoordinatorTest method testSavepointsAreNotSubsumed.

/**
 * Checks that savepoints are not subsumed by checkpoints and do not subsume checkpoints.
 *
 * <p>Scenario, as driven by the assertions below:
 *
 * <ol>
 *   <li>Savepoint 1, checkpoint 1 and checkpoint 2 are triggered (3 pending).
 *   <li>Checkpoint 2 is acknowledged by both tasks: the pending count drops to 1 and the
 *       remaining pending entry is savepoint 1, which is neither disposed nor done.
 *   <li>Checkpoint 3 and savepoint 2 are triggered (3 pending). Savepoint 2 is acknowledged
 *       and completes; the pending count drops only to 2, so checkpoint 3 and savepoint 1
 *       both stay pending.
 *   <li>Savepoint 1 is acknowledged and completes (1 pending).
 *   <li>Checkpoint 4 is triggered and acknowledged; its completion is reported together with
 *       the id of checkpoint 2.
 * </ol>
 *
 * <p>The number of retained successful checkpoints is asserted to be 1 after each of the
 * steps 2 to 4.
 */
@Test
public void testSavepointsAreNotSubsumed() throws Exception {
    // Two job vertices; the first task vertex of each supplies the attempt id used for acks.
    JobVertexID jobVertexID1 = new JobVertexID();
    JobVertexID jobVertexID2 = new JobVertexID();
    ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1).addJobVertex(jobVertexID2).build();
    ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
    ExecutionVertex vertex2 = graph.getJobVertex(jobVertexID2).getTaskVertices()[0];
    ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();
    ExecutionAttemptID attemptID2 = vertex2.getCurrentExecutionAttempt().getAttemptId();
    // The counter is kept in a local so the id of the most recent trigger can be read back.
    StandaloneCheckpointIDCounter counter = new StandaloneCheckpointIDCounter();
    // set up the coordinator and validate the initial state
    // The coordinator is wrapped in a spy so that sendAcknowledgeMessages calls can be verified
    // below. NOTE(review): the argument 1 of StandaloneCompletedCheckpointStore is presumably
    // the maximum number of retained checkpoints - confirm.
    CheckpointCoordinator checkpointCoordinator = spy(new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCheckpointCoordinatorConfiguration(CheckpointCoordinatorConfiguration.builder().setMaxConcurrentCheckpoints(Integer.MAX_VALUE).build()).setCheckpointIDCounter(counter).setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(1)).setTimer(manuallyTriggeredScheduledExecutor).build());
    String savepointDir = tmpFolder.newFolder().getAbsolutePath();
    // Trigger savepoint and checkpoint
    CompletableFuture<CompletedCheckpoint> savepointFuture1 = checkpointCoordinator.triggerSavepoint(savepointDir, SavepointFormatType.CANONICAL);
    // presumably runs the work the coordinator queued on the manual timer - TODO confirm
    manuallyTriggeredScheduledExecutor.triggerAll();
    // id most recently handed out by the counter, i.e. the one used for savepoint 1
    long savepointId1 = counter.getLast();
    assertEquals(1, checkpointCoordinator.getNumberOfPendingCheckpoints());
    CompletableFuture<CompletedCheckpoint> checkpointFuture1 = checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    assertEquals(2, checkpointCoordinator.getNumberOfPendingCheckpoints());
    // surface a failed trigger immediately instead of continuing with a broken scenario
    FutureUtils.throwIfCompletedExceptionally(checkpointFuture1);
    CompletableFuture<CompletedCheckpoint> checkpointFuture2 = checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(checkpointFuture2);
    long checkpointId2 = counter.getLast();
    assertEquals(3, checkpointCoordinator.getNumberOfPendingCheckpoints());
    // 2nd checkpoint should subsume the 1st checkpoint, but not the savepoint
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId2), TASK_MANAGER_LOCATION_INFO);
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointId2), TASK_MANAGER_LOCATION_INFO);
    // no completed checkpoint before checkpointId2.
    verify(checkpointCoordinator, times(1)).sendAcknowledgeMessages(anyList(), eq(checkpointId2), anyLong(), eq(INVALID_CHECKPOINT_ID));
    // only savepoint 1 is left pending; checkpoint 2 is the single retained checkpoint
    assertEquals(1, checkpointCoordinator.getNumberOfPendingCheckpoints());
    assertEquals(1, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
    assertFalse(checkpointCoordinator.getPendingCheckpoints().get(savepointId1).isDisposed());
    assertFalse(savepointFuture1.isDone());
    // Trigger a third checkpoint and a second savepoint while savepoint 1 is still pending
    CompletableFuture<CompletedCheckpoint> checkpointFuture3 = checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(checkpointFuture3);
    assertEquals(2, checkpointCoordinator.getNumberOfPendingCheckpoints());
    CompletableFuture<CompletedCheckpoint> savepointFuture2 = checkpointCoordinator.triggerSavepoint(savepointDir, SavepointFormatType.CANONICAL);
    manuallyTriggeredScheduledExecutor.triggerAll();
    long savepointId2 = counter.getLast();
    FutureUtils.throwIfCompletedExceptionally(savepointFuture2);
    assertEquals(3, checkpointCoordinator.getNumberOfPendingCheckpoints());
    // savepoints should not subsume checkpoints
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, savepointId2), TASK_MANAGER_LOCATION_INFO);
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, savepointId2), TASK_MANAGER_LOCATION_INFO);
    // we do not send notify checkpoint complete for savepoints
    verify(checkpointCoordinator, times(0)).sendAcknowledgeMessages(anyList(), eq(savepointId2), anyLong(), anyLong());
    // pending drops from 3 to 2 only: savepoint 2 completed, checkpoint 3 and savepoint 1 remain
    assertEquals(2, checkpointCoordinator.getNumberOfPendingCheckpoints());
    assertEquals(1, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
    assertFalse(checkpointCoordinator.getPendingCheckpoints().get(savepointId1).isDisposed());
    assertFalse(savepointFuture1.isDone());
    assertNotNull(savepointFuture2.get());
    // Ack first savepoint
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, savepointId1), TASK_MANAGER_LOCATION_INFO);
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, savepointId1), TASK_MANAGER_LOCATION_INFO);
    // we do not send notify checkpoint complete for savepoints
    verify(checkpointCoordinator, times(0)).sendAcknowledgeMessages(anyList(), eq(savepointId1), anyLong(), anyLong());
    // one entry (checkpoint 3) is still pending; the retained count is unchanged
    assertEquals(1, checkpointCoordinator.getNumberOfPendingCheckpoints());
    assertEquals(1, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
    assertNotNull(savepointFuture1.get());
    // Trigger and acknowledge a fourth checkpoint
    CompletableFuture<CompletedCheckpoint> checkpointFuture4 = checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(checkpointFuture4);
    long checkpointId4 = counter.getLast();
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId4), TASK_MANAGER_LOCATION_INFO);
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointId4), TASK_MANAGER_LOCATION_INFO);
    // checkpoint2 would be subsumed.
    // NOTE(review): the last argument is presumably the id of the checkpoint being subsumed;
    // it is checkpointId2 here, not either savepoint id - confirm against sendAcknowledgeMessages.
    verify(checkpointCoordinator, times(1)).sendAcknowledgeMessages(anyList(), eq(checkpointId4), anyLong(), eq(checkpointId2));
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Test(org.junit.Test)

Example 37 with ExecutionGraph

use of org.apache.flink.runtime.executiongraph.ExecutionGraph in project flink by apache.

the class CheckpointCoordinatorTriggeringTest method testTriggeringFullCheckpoints.

/**
 * Restores from a savepoint in {@code NO_CLAIM} mode, takes another savepoint before any
 * checkpoint has completed, and then asserts that the next triggered checkpoint is recorded by
 * the gateway with type {@code FULL_CHECKPOINT}.
 */
@Test
public void testTriggeringFullCheckpoints() throws Exception {
    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway recordingGateway =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    final JobVertexID vertexId = new JobVertexID();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId)
                    .setTaskManagerGateway(recordingGateway)
                    .build();
    final ExecutionVertex firstTask = executionGraph.getJobVertex(vertexId).getTaskVertices()[0];
    final ExecutionAttemptID attempt = firstTask.getCurrentExecutionAttempt().getAttemptId();

    // A savepoint that serves as the restore source further down.
    final CompletedCheckpoint restoreSource = takeSavepoint(executionGraph, attempt);

    // Fresh store and id counter for the coordinator that performs the NO_CLAIM restore.
    final StandaloneCompletedCheckpointStore completedStore =
            new StandaloneCompletedCheckpointStore(1);
    final StandaloneCheckpointIDCounter idCounter = new StandaloneCheckpointIDCounter();
    final CheckpointCoordinator coordinator =
            createCheckpointCoordinator(executionGraph, completedStore, idCounter);
    final SavepointRestoreSettings noClaimSettings =
            SavepointRestoreSettings.forPath(
                    restoreSource.getExternalPointer(), true, RestoreMode.NO_CLAIM);
    coordinator.restoreSavepoint(
            noClaimSettings, executionGraph.getAllVertices(), getClass().getClassLoader());

    // Taking a savepoint before any checkpoint completes must not change the expectation that
    // the next triggered checkpoint is a full one.
    takeSavepoint(executionGraph, attempt, coordinator, 2);
    coordinator.startCheckpointScheduler();
    // Reset the recorder so that only the checkpoint triggered below is inspected.
    recordingGateway.resetCount();

    // This checkpoint is the one expected to be a FULL_CHECKPOINT.
    final CompletableFuture<CompletedCheckpoint> pendingResult =
            coordinator.triggerCheckpoint(true);
    manuallyTriggeredScheduledExecutor.triggerAll();
    final AcknowledgeCheckpoint ack =
            new AcknowledgeCheckpoint(executionGraph.getJobID(), attempt, 3);
    coordinator.receiveAcknowledgeMessage(ack, TASK_MANAGER_LOCATION_INFO);
    pendingResult.get();

    assertThat(
            recordingGateway
                    .getOnlyTriggeredCheckpoint(attempt)
                    .checkpointOptions
                    .getCheckpointType(),
            is(CheckpointType.FULL_CHECKPOINT));
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Test(org.junit.Test)

Example 38 with ExecutionGraph

use of org.apache.flink.runtime.executiongraph.ExecutionGraph in project flink by apache.

the class CheckpointCoordinatorTriggeringTest method testTriggerCheckpointSnapshotMasterHookFailed.

/**
 * Registers a master hook whose snapshot future is completed exceptionally and asserts that
 * the trigger then fails with {@code TRIGGER_CHECKPOINT_FAILURE}, that the recording gateway
 * has seen no triggered checkpoint, and that the trigger request queue is empty.
 */
@Test
public void testTriggerCheckpointSnapshotMasterHookFailed() throws Exception {
    final JobVertexID vertexId = new JobVertexID();
    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway recordingGateway =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId)
                    .setTaskManagerGateway(recordingGateway)
                    .build();
    final ExecutionVertex firstTask = executionGraph.getJobVertex(vertexId).getTaskVertices()[0];
    final ExecutionAttemptID attempt = firstTask.getCurrentExecutionAttempt().getAttemptId();

    // NOTE(review): the coordinator comes from the no-arg factory, so it may not be wired to
    // the graph/gateway built above - confirm the gateway assertion at the end is meaningful.
    final CheckpointCoordinator coordinator = createCheckpointCoordinator();

    // The hook's snapshot future stays open until the test decides to fail it.
    final CompletableFuture<String> hookSnapshot = new CompletableFuture<>();
    coordinator.addMasterHook(new TestingMasterHook(hookSnapshot));
    coordinator.startCheckpointScheduler();
    final CompletableFuture<CompletedCheckpoint> triggerResult =
            triggerPeriodicCheckpoint(coordinator);

    // While the hook snapshot is unfinished, the coordinator remains in the triggering phase.
    manuallyTriggeredScheduledExecutor.triggerAll();
    assertTrue(coordinator.isTriggering());

    // Fail the hook snapshot and let the coordinator proceed.
    hookSnapshot.completeExceptionally(new Exception("by design"));
    manuallyTriggeredScheduledExecutor.triggerAll();
    assertFalse(coordinator.isTriggering());

    try {
        triggerResult.get();
        fail("Should not reach here");
    } catch (ExecutionException expected) {
        final Optional<CheckpointException> checkpointFailure =
                ExceptionUtils.findThrowable(expected, CheckpointException.class);
        assertTrue(checkpointFailure.isPresent());
        assertEquals(
                CheckpointFailureReason.TRIGGER_CHECKPOINT_FAILURE,
                checkpointFailure.get().getCheckpointFailureReason());
    }

    // No checkpoint request reached the task manager, and nothing is left queued.
    assertEquals(0, recordingGateway.getTriggeredCheckpoints(attempt).size());
    assertEquals(0, coordinator.getTriggerRequestQueue().size());
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) Optional(java.util.Optional) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) ExecutionException(java.util.concurrent.ExecutionException) CompletableFuture(java.util.concurrent.CompletableFuture) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Test(org.junit.Test)

Example 39 with ExecutionGraph

use of org.apache.flink.runtime.executiongraph.ExecutionGraph in project flink by apache.

the class CheckpointCoordinatorTriggeringTest method testTriggerCheckpointRequestQueuedWithFailure.

/**
 * Queues three non-periodic trigger requests back to back, with a checkpoint id counter that
 * makes the first one fail, and asserts that only the first request completes exceptionally
 * while the gateway records two triggered checkpoints.
 */
@Test
public void testTriggerCheckpointRequestQueuedWithFailure() throws Exception {
    final JobVertexID vertexId = new JobVertexID();
    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway recordingGateway =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId)
                    .setTaskManagerGateway(recordingGateway)
                    .build();
    final ExecutionVertex firstTask = executionGraph.getJobVertex(vertexId).getTaskVertices()[0];
    final ExecutionAttemptID attempt = firstTask.getCurrentExecutionAttempt().getAttemptId();

    // Coordinator driven by the manual timer, using an id counter configured with the
    // predicate "id == 0".
    final CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setCheckpointIDCounter(
                            new UnstableCheckpointIDCounter(checkpointId -> checkpointId == 0))
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();
    coordinator.startCheckpointScheduler();

    // First (non-periodic) request: the coordinator starts triggering, nothing is queued yet.
    final CompletableFuture<CompletedCheckpoint> firstResult =
            triggerNonPeriodicCheckpoint(coordinator);
    assertTrue(coordinator.isTriggering());
    assertEquals(0, coordinator.getTriggerRequestQueue().size());

    // Two further requests arrive while the first is still in flight, so both are queued.
    final CompletableFuture<CompletedCheckpoint> secondResult =
            triggerNonPeriodicCheckpoint(coordinator);
    final CompletableFuture<CompletedCheckpoint> thirdResult =
            triggerNonPeriodicCheckpoint(coordinator);
    assertTrue(coordinator.isTriggering());
    assertEquals(2, coordinator.getTriggerRequestQueue().size());

    manuallyTriggeredScheduledExecutor.triggerAll();

    // The first triggered checkpoint fails by design through UnstableCheckpointIDCounter;
    // the two queued requests have not failed.
    assertTrue(firstResult.isCompletedExceptionally());
    assertFalse(secondResult.isCompletedExceptionally());
    assertFalse(thirdResult.isCompletedExceptionally());

    // Triggering has finished, the queue is drained, and two checkpoints reached the gateway.
    assertFalse(coordinator.isTriggering());
    assertEquals(0, coordinator.getTriggerRequestQueue().size());
    assertEquals(2, recordingGateway.getTriggeredCheckpoints(attempt).size());
}
Also used : OneShotLatch(org.apache.flink.core.testutils.OneShotLatch) ManuallyTriggeredScheduledExecutor(org.apache.flink.util.concurrent.ManuallyTriggeredScheduledExecutor) SavepointFormatType(org.apache.flink.core.execution.SavepointFormatType) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) ScheduledExecutorServiceAdapter(org.apache.flink.util.concurrent.ScheduledExecutorServiceAdapter) ExceptionUtils(org.apache.flink.util.ExceptionUtils) CompletableFuture(java.util.concurrent.CompletableFuture) JobStatus(org.apache.flink.api.common.JobStatus) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) HashSet(java.util.HashSet) CoreMatchers.instanceOf(org.hamcrest.CoreMatchers.instanceOf) Assert.assertThat(org.junit.Assert.assertThat) RestoreMode(org.apache.flink.runtime.jobgraph.RestoreMode) TestLogger(org.apache.flink.util.TestLogger) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) Assert.fail(org.junit.Assert.fail) Preconditions.checkNotNull(org.apache.flink.util.Preconditions.checkNotNull) Nullable(javax.annotation.Nullable) Before(org.junit.Before) Executor(java.util.concurrent.Executor) CheckpointCoordinatorConfigurationBuilder(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder) Predicate(java.util.function.Predicate) Assert.assertTrue(org.junit.Assert.assertTrue) Test(org.junit.Test) Executors(java.util.concurrent.Executors) ExecutorUtils(org.apache.flink.util.ExecutorUtils) ExecutionException(java.util.concurrent.ExecutionException) TimeUnit(java.util.concurrent.TimeUnit) 
ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) List(java.util.List) Rule(org.junit.Rule) SimpleVersionedSerializer(org.apache.flink.core.io.SimpleVersionedSerializer) Assert.assertFalse(org.junit.Assert.assertFalse) Optional(java.util.Optional) Matchers.is(org.hamcrest.Matchers.is) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) SavepointRestoreSettings(org.apache.flink.runtime.jobgraph.SavepointRestoreSettings) TemporaryFolder(org.junit.rules.TemporaryFolder) Assert.assertEquals(org.junit.Assert.assertEquals) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Test(org.junit.Test)

Example 40 with ExecutionGraph

use of org.apache.flink.runtime.executiongraph.ExecutionGraph in project flink by apache.

the class DefaultCheckpointPlanCalculatorTest method runWithNotRunningTask.

/**
 * For every {@link ExecutionState} other than {@code RUNNING}, builds a two-vertex graph with
 * one vertex in {@code RUNNING} and the other in that state, and asserts that calculating the
 * checkpoint plan fails with {@code NOT_ALL_REQUIRED_TASKS_RUNNING}.
 *
 * @param isRunningVertexSource passed to the graph builder for the vertex put into RUNNING
 * @param isNotRunningVertexSource passed to the graph builder for the vertex put into the
 *     non-RUNNING state
 * @throws Exception if building the graph or moving a vertex to its state fails
 */
private void runWithNotRunningTask(boolean isRunningVertexSource, boolean isNotRunningVertexSource) throws Exception {
    for (ExecutionState targetState : complementOf(EnumSet.of(ExecutionState.RUNNING))) {
        final JobVertexID runningId = new JobVertexID();
        final JobVertexID otherId = new JobVertexID();

        // Vertices are not moved to RUNNING automatically; each state is set explicitly below.
        final ExecutionGraph executionGraph =
                new CheckpointExecutionGraphBuilder()
                        .addJobVertex(runningId, isRunningVertexSource)
                        .addJobVertex(otherId, isNotRunningVertexSource)
                        .setTransitToRunning(false)
                        .build();

        // First vertex: always RUNNING. Second vertex: the non-RUNNING state under test.
        transitVertexToState(executionGraph, runningId, ExecutionState.RUNNING);
        transitVertexToState(executionGraph, otherId, targetState);

        final DefaultCheckpointPlanCalculator planCalculator =
                createCheckpointPlanCalculator(executionGraph);
        try {
            planCalculator.calculateCheckpointPlan().get();
            fail("The computation should fail since some tasks to trigger are in " + targetState + " state");
        } catch (ExecutionException expected) {
            final Throwable rootCause = expected.getCause();
            assertThat(rootCause, instanceOf(CheckpointException.class));
            final CheckpointException checkpointFailure = (CheckpointException) rootCause;
            assertEquals(
                    CheckpointFailureReason.NOT_ALL_REQUIRED_TASKS_RUNNING,
                    checkpointFailure.getCheckpointFailureReason());
        }
    }
}
Also used : ExecutionState(org.apache.flink.runtime.execution.ExecutionState) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) ExecutionException(java.util.concurrent.ExecutionException) CheckpointExecutionGraphBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder)

Aggregations

ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph)120 Test (org.junit.Test)96 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)77 ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex)53 CheckpointCoordinatorBuilder (org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder)40 ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)36 AcknowledgeCheckpoint (org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint)35 ExecutionJobVertex (org.apache.flink.runtime.executiongraph.ExecutionJobVertex)31 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)24 OperatorID (org.apache.flink.runtime.jobgraph.OperatorID)24 HashMap (java.util.HashMap)20 CompletableFuture (java.util.concurrent.CompletableFuture)19 JobID (org.apache.flink.api.common.JobID)19 ArrayList (java.util.ArrayList)17 HashSet (java.util.HashSet)17 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)17 DeclineCheckpoint (org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint)17 ExecutionException (java.util.concurrent.ExecutionException)13 Executor (java.util.concurrent.Executor)13 IOException (java.io.IOException)12