Search in sources :

Example 31 with ExecutionAttemptID

use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.

the class CheckpointCoordinatorTest method testMinCheckpointPause.

/**
 * Checks the minimum pause between checkpoints: with a min pause of 1000 ms and at most one
 * concurrent checkpoint, a second (queued) trigger request must not be pending half a pause
 * after the first checkpoint was acknowledged, but must become pending eventually.
 */
@Test
public void testMinCheckpointPause() throws Exception {
    // The timer runs on a separate thread so that a checkpoint could be triggered while the
    // test thread is still inside receiveAcknowledgeMessage.
    final ScheduledExecutorService timerThread = Executors.newSingleThreadScheduledExecutor();
    CheckpointCoordinator coordinator = null;
    try {
        final int pauseMillis = 1000;
        final JobVertexID vertexId = new JobVertexID();

        final ExecutionGraph graph =
                new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                        .addJobVertex(vertexId)
                        .setMainThreadExecutor(
                                ComponentMainThreadExecutorServiceAdapter.forSingleThreadExecutor(
                                        new DirectScheduledExecutorService()))
                        .build();
        final ExecutionAttemptID attemptId =
                graph.getJobVertex(vertexId)
                        .getTaskVertices()[0]
                        .getCurrentExecutionAttempt()
                        .getAttemptId();

        final CheckpointCoordinatorConfiguration minPauseConfig =
                CheckpointCoordinatorConfiguration.builder()
                        .setCheckpointInterval(pauseMillis)
                        .setCheckpointTimeout(Long.MAX_VALUE)
                        .setMaxConcurrentCheckpoints(1)
                        .setMinPauseBetweenCheckpoints(pauseMillis)
                        .build();
        coordinator =
                new CheckpointCoordinatorBuilder()
                        .setTimer(new ScheduledExecutorServiceAdapter(timerThread))
                        .setCheckpointCoordinatorConfiguration(minPauseConfig)
                        .setExecutionGraph(graph)
                        .build();
        coordinator.startCheckpointScheduler();

        // First request: trigger, execute, and later complete by receiveAcknowledgeMessage.
        coordinator.triggerCheckpoint(true);
        // Second request: enqueue and later see if it gets executed in the middle of
        // receiveAcknowledgeMessage.
        coordinator.triggerCheckpoint(true);

        // Block until at least one request has been fully processed.
        while (coordinator.getNumberOfPendingCheckpoints() == 0) {
            Thread.sleep(10);
        }

        final AcknowledgeCheckpoint firstAck =
                new AcknowledgeCheckpoint(graph.getJobID(), attemptId, 1L);
        coordinator.receiveAcknowledgeMessage(firstAck, TASK_MANAGER_LOCATION_INFO);

        // Half a pause after the acknowledgement nothing may be pending yet.
        Thread.sleep(pauseMillis / 2);
        assertEquals(0, coordinator.getNumberOfPendingCheckpoints());

        // The second request must still be processed eventually.
        while (coordinator.getNumberOfPendingCheckpoints() == 0) {
            Thread.sleep(1);
        }
    } finally {
        if (coordinator != null) {
            coordinator.shutdown();
        }
        timerThread.shutdownNow();
    }
}
Also used : DirectScheduledExecutorService(org.apache.flink.runtime.testutils.DirectScheduledExecutorService) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) DirectScheduledExecutorService(org.apache.flink.runtime.testutils.DirectScheduledExecutorService) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ScheduledExecutorServiceAdapter(org.apache.flink.util.concurrent.ScheduledExecutorServiceAdapter) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Test(org.junit.Test)

Example 32 with ExecutionAttemptID

use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.

the class CheckpointCoordinatorTest method testTriggerAndDeclineCheckpointSimple.

/**
 * Triggers one checkpoint across two tasks, acknowledges it from one task and then declines it
 * from the other.
 *
 * <p>The test expects the declined checkpoint to be disposed, no scheduled task to remain
 * active, and no pending checkpoint to be left behind. Repeating the decline from either task
 * must leave the checkpoint disposed, and the fail-job callback must have been invoked exactly
 * once by the end.
 *
 * @param checkpointFailureReason failure reason wrapped into the {@code CheckpointException}
 *     that is attached to every decline message
 */
private void testTriggerAndDeclineCheckpointSimple(CheckpointFailureReason checkpointFailureReason) throws Exception {
    final CheckpointException declineCause = new CheckpointException(checkpointFailureReason);

    final JobVertexID firstVertexId = new JobVertexID();
    final JobVertexID secondVertexId = new JobVertexID();
    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway recorder =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    final ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(firstVertexId)
                    .addJobVertex(secondVertexId)
                    .setTaskManagerGateway(recorder)
                    .build();

    final ExecutionVertex firstTask = graph.getJobVertex(firstVertexId).getTaskVertices()[0];
    final ExecutionVertex secondTask = graph.getJobVertex(secondVertexId).getTaskVertices()[0];
    final ExecutionAttemptID firstAttempt = firstTask.getCurrentExecutionAttempt().getAttemptId();
    final ExecutionAttemptID secondAttempt = secondTask.getCurrentExecutionAttempt().getAttemptId();

    final TestFailJobCallback failJobCallback = new TestFailJobCallback();

    // Build the coordinator and make sure it starts out empty.
    final CheckpointCoordinatorConfiguration config =
            CheckpointCoordinatorConfiguration.builder()
                    .setAlignedCheckpointTimeout(Long.MAX_VALUE)
                    .setMaxConcurrentCheckpoints(Integer.MAX_VALUE)
                    .build();
    final CheckpointCoordinator checkpointCoordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setCheckpointCoordinatorConfiguration(config)
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .setCheckpointFailureManager(new CheckpointFailureManager(0, failJobCallback))
                    .build();
    assertEquals(0, checkpointCoordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());

    // Trigger the first checkpoint; this is expected to succeed.
    final CompletableFuture<CompletedCheckpoint> triggerResult =
            checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(triggerResult);

    // Exactly one checkpoint is pending and nothing has completed yet.
    assertEquals(1, checkpointCoordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
    // One task is scheduled that will cancel the checkpoint after the timeout.
    assertEquals(1, manuallyTriggeredScheduledExecutor.getActiveScheduledTasks().size());

    final long checkpointId =
            checkpointCoordinator.getPendingCheckpoints().keySet().iterator().next();
    final PendingCheckpoint checkpoint =
            checkpointCoordinator.getPendingCheckpoints().get(checkpointId);
    assertNotNull(checkpoint);
    assertEquals(checkpointId, checkpoint.getCheckpointId());
    assertEquals(graph.getJobID(), checkpoint.getJobId());
    assertEquals(2, checkpoint.getNumberOfNonAcknowledgedTasks());
    assertEquals(0, checkpoint.getNumberOfAcknowledgedTasks());
    assertEquals(0, checkpoint.getOperatorStates().size());
    assertFalse(checkpoint.isDisposed());
    assertFalse(checkpoint.areTasksFullyAcknowledged());

    // Both vertices must have received the trigger message for this checkpoint.
    for (ExecutionVertex task : Arrays.asList(firstTask, secondTask)) {
        final ExecutionAttemptID attempt = task.getCurrentExecutionAttempt().getAttemptId();
        final CheckpointCoordinatorTestingUtils.TriggeredCheckpoint triggered =
                recorder.getOnlyTriggeredCheckpoint(attempt);
        assertEquals(checkpointId, triggered.checkpointId);
        assertEquals(checkpoint.getCheckpointTimestamp(), triggered.timestamp);
        assertEquals(
                CheckpointOptions.forCheckpointWithDefaultLocation(),
                triggered.checkpointOptions);
    }

    // Acknowledge from the second task only.
    checkpointCoordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(graph.getJobID(), secondAttempt, checkpointId),
            "Unknown location");
    assertEquals(1, checkpoint.getNumberOfAcknowledgedTasks());
    assertEquals(1, checkpoint.getNumberOfNonAcknowledgedTasks());
    assertFalse(checkpoint.isDisposed());
    assertFalse(checkpoint.areTasksFullyAcknowledged());

    // A duplicate acknowledgement from the same task should not matter.
    checkpointCoordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(graph.getJobID(), secondAttempt, checkpointId),
            "Unknown location");
    assertFalse(checkpoint.isDisposed());
    assertFalse(checkpoint.areTasksFullyAcknowledged());

    // Declining from the first task should cancel the checkpoint.
    checkpointCoordinator.receiveDeclineMessage(
            new DeclineCheckpoint(graph.getJobID(), firstAttempt, checkpointId, declineCause),
            TASK_MANAGER_LOCATION_INFO);
    assertTrue(checkpoint.isDisposed());
    // The canceller is removed as well.
    assertEquals(0, manuallyTriggeredScheduledExecutor.getActiveScheduledTasks().size());
    // No pending checkpoint is left and nothing has been retained.
    assertEquals(0, checkpointCoordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());

    // Declining again, first from the same task and then from the other one, should have no
    // further effect.
    checkpointCoordinator.receiveDeclineMessage(
            new DeclineCheckpoint(graph.getJobID(), firstAttempt, checkpointId, declineCause),
            TASK_MANAGER_LOCATION_INFO);
    checkpointCoordinator.receiveDeclineMessage(
            new DeclineCheckpoint(graph.getJobID(), secondAttempt, checkpointId, declineCause),
            TASK_MANAGER_LOCATION_INFO);
    assertTrue(checkpoint.isDisposed());
    assertEquals(1, failJobCallback.getInvokeCounter());

    checkpointCoordinator.shutdown();
}
Also used : DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph)

Example 33 with ExecutionAttemptID

use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.

the class CheckpointCoordinatorTest method testTriggerAndConfirmSimpleSavepoint.

/**
 * Triggers a savepoint across two tasks and acknowledges it from both.
 *
 * <p>The savepoint future must stay incomplete until both tasks have acknowledged (a duplicate
 * acknowledgement from one task does not count). Once completed, the pending checkpoint is
 * disposed, the savepoint is not counted as a retained checkpoint, and the resulting
 * {@code CompletedCheckpoint} carries state for both operators.
 */
@Test
public void testTriggerAndConfirmSimpleSavepoint() throws Exception {
    final JobVertexID firstVertexId = new JobVertexID();
    final JobVertexID secondVertexId = new JobVertexID();
    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway recorder =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    final ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(firstVertexId)
                    .addJobVertex(secondVertexId)
                    .setTaskManagerGateway(recorder)
                    .build();

    final ExecutionVertex firstTask = graph.getJobVertex(firstVertexId).getTaskVertices()[0];
    final ExecutionVertex secondTask = graph.getJobVertex(secondVertexId).getTaskVertices()[0];
    final ExecutionAttemptID firstAttempt = firstTask.getCurrentExecutionAttempt().getAttemptId();
    final ExecutionAttemptID secondAttempt = secondTask.getCurrentExecutionAttempt().getAttemptId();

    // Build the coordinator and make sure it starts out empty.
    final CheckpointCoordinator checkpointCoordinator = getCheckpointCoordinator(graph);
    assertEquals(0, checkpointCoordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());

    // Trigger the savepoint; triggering is expected to succeed but not to complete yet.
    final String savepointDir = tmpFolder.newFolder().getAbsolutePath();
    final CompletableFuture<CompletedCheckpoint> savepointFuture =
            checkpointCoordinator.triggerSavepoint(savepointDir, SavepointFormatType.CANONICAL);
    manuallyTriggeredScheduledExecutor.triggerAll();
    assertFalse(savepointFuture.isDone());

    // Exactly one savepoint is pending.
    assertEquals(1, checkpointCoordinator.getNumberOfPendingCheckpoints());
    final long checkpointId =
            checkpointCoordinator.getPendingCheckpoints().keySet().iterator().next();
    final PendingCheckpoint pending =
            checkpointCoordinator.getPendingCheckpoints().get(checkpointId);
    assertNotNull(pending);
    assertEquals(checkpointId, pending.getCheckpointId());
    assertEquals(graph.getJobID(), pending.getJobId());
    assertEquals(2, pending.getNumberOfNonAcknowledgedTasks());
    assertEquals(0, pending.getNumberOfAcknowledgedTasks());
    assertEquals(0, pending.getOperatorStates().size());
    assertFalse(pending.isDisposed());
    assertFalse(pending.areTasksFullyAcknowledged());
    assertFalse(pending.canBeSubsumed());

    // One mocked subtask state per task, keyed by the operator id derived from its vertex id.
    final OperatorID firstOperatorId = OperatorID.fromJobVertexID(firstTask.getJobvertexId());
    final OperatorID secondOperatorId = OperatorID.fromJobVertexID(secondTask.getJobvertexId());
    final OperatorSubtaskState firstSubtaskState = mock(OperatorSubtaskState.class);
    final OperatorSubtaskState secondSubtaskState = mock(OperatorSubtaskState.class);
    final TaskStateSnapshot firstSnapshot =
            new TaskStateSnapshot(singletonMap(firstOperatorId, firstSubtaskState));
    final TaskStateSnapshot secondSnapshot =
            new TaskStateSnapshot(singletonMap(secondOperatorId, secondSubtaskState));

    // Acknowledge from the second task only.
    final AcknowledgeCheckpoint secondAck =
            new AcknowledgeCheckpoint(
                    graph.getJobID(),
                    secondAttempt,
                    checkpointId,
                    new CheckpointMetrics(),
                    secondSnapshot);
    checkpointCoordinator.receiveAcknowledgeMessage(secondAck, TASK_MANAGER_LOCATION_INFO);
    assertEquals(1, pending.getNumberOfAcknowledgedTasks());
    assertEquals(1, pending.getNumberOfNonAcknowledgedTasks());
    assertFalse(pending.isDisposed());
    assertFalse(pending.areTasksFullyAcknowledged());
    assertFalse(savepointFuture.isDone());

    // A duplicate acknowledgement from the same task should not matter.
    checkpointCoordinator.receiveAcknowledgeMessage(secondAck, TASK_MANAGER_LOCATION_INFO);
    assertFalse(pending.isDisposed());
    assertFalse(pending.areTasksFullyAcknowledged());
    assertFalse(savepointFuture.isDone());

    // Acknowledge from the remaining task.
    final AcknowledgeCheckpoint firstAck =
            new AcknowledgeCheckpoint(
                    graph.getJobID(),
                    firstAttempt,
                    checkpointId,
                    new CheckpointMetrics(),
                    firstSnapshot);
    checkpointCoordinator.receiveAcknowledgeMessage(firstAck, TASK_MANAGER_LOCATION_INFO);

    // The savepoint has been converted into a completed checkpoint internally and the pending
    // checkpoint object has been disposed.
    assertTrue(pending.isDisposed());
    assertNotNull(savepointFuture.get());

    // Savepoints should not be registered as retained checkpoints, and nothing is pending.
    assertEquals(0, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
    assertEquals(0, checkpointCoordinator.getNumberOfPendingCheckpoints());

    // For each task, the recorded trigger is this savepoint and no completed-checkpoint
    // notification has been recorded.
    for (ExecutionVertex task : Arrays.asList(firstTask, secondTask)) {
        final ExecutionAttemptID attempt = task.getCurrentExecutionAttempt().getAttemptId();
        assertEquals(checkpointId, recorder.getOnlyTriggeredCheckpoint(attempt).checkpointId);
        assertThat(recorder.getNotifiedCompletedCheckpoints(attempt)).isEmpty();
    }

    final CompletedCheckpoint success = savepointFuture.get();
    assertEquals(graph.getJobID(), success.getJobId());
    assertEquals(pending.getCheckpointId(), success.getCheckpointID());
    assertEquals(2, success.getOperatorStates().size());

    checkpointCoordinator.shutdown();
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Test(org.junit.Test)

Example 34 with ExecutionAttemptID

use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.

the class CheckpointCoordinatorTest method jobFailsIfInFlightSynchronousSavepointIsDiscarded.

/**
 * Verifies that the job is failed when an in-flight synchronous savepoint is declined.
 *
 * <p>A synchronous savepoint is triggered and then declined for the first task with a custom
 * {@link IOException}. The test expects the pending savepoint to be disposed, the savepoint
 * future to fail with a {@code CheckpointException} whose nested cause carries the custom
 * message, and {@code failJob} to be invoked exactly once with an equivalent exception.
 */
@Test
public void jobFailsIfInFlightSynchronousSavepointIsDiscarded() throws Exception {
    // Mutable holder written by the fail-job callback: f0 = invocation count, f1 = last cause.
    final Tuple2<Integer, Throwable> invocationCounterAndException = Tuple2.of(0, null);
    final Throwable expectedRootCause = new IOException("Custom-Exception");
    JobVertexID jobVertexID1 = new JobVertexID();
    JobVertexID jobVertexID2 = new JobVertexID();
    ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1).addJobVertex(jobVertexID2).build();
    // Only the first task sends the decline, so only its attempt id is looked up.
    ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
    ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();
    // set up the coordinator with a callback that records every failJob invocation
    final CheckpointCoordinator coordinator = getCheckpointCoordinator(graph, new CheckpointFailureManager(0, new CheckpointFailureManager.FailJobCallback() {

        @Override
        public void failJob(Throwable cause) {
            invocationCounterAndException.f0 += 1;
            invocationCounterAndException.f1 = cause;
        }

        @Override
        public void failJobDueToTaskFailure(Throwable cause, ExecutionAttemptID failingTask) {
            throw new AssertionError("This method should not be called for the test.");
        }
    }));
    final CompletableFuture<CompletedCheckpoint> savepointFuture = coordinator.triggerSynchronousSavepoint(false, "test-dir", SavepointFormatType.CANONICAL);
    manuallyTriggeredScheduledExecutor.triggerAll();
    final PendingCheckpoint syncSavepoint = declineSynchronousSavepoint(graph.getJobID(), coordinator, attemptID1, expectedRootCause);
    assertTrue(syncSavepoint.isDisposed());
    try {
        savepointFuture.get();
        fail("Expected Exception not found.");
    } catch (ExecutionException e) {
        final Throwable cause = ExceptionUtils.stripExecutionException(e);
        assertTrue(cause instanceof CheckpointException);
        assertEquals(expectedRootCause.getMessage(), cause.getCause().getCause().getMessage());
    }
    assertEquals(1, invocationCounterAndException.f0.intValue());
    // Checked separately so that a failure shows which expectation was violated.
    assertTrue(invocationCounterAndException.f1 instanceof CheckpointException);
    assertEquals(expectedRootCause.getMessage(), invocationCounterAndException.f1.getCause().getCause().getMessage());
    coordinator.shutdown();
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) IOException(java.io.IOException) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) ExecutionException(java.util.concurrent.ExecutionException) Test(org.junit.Test)

Example 35 with ExecutionAttemptID

use of org.apache.flink.runtime.executiongraph.ExecutionAttemptID in project flink by apache.

the class CheckpointCoordinatorTest method testExternallyInducedSourceWithOperatorCoordinator.

/**
 * Test that the checkpoint still behaves correctly when the task checkpoint is triggered by the
 * master hooks and finished before the master checkpoint. Also make sure that the operator
 * coordinators are checkpointed before starting the task checkpoint.
 *
 * <p>The master hook acknowledges the checkpoint for both tasks from inside
 * {@code triggerCheckpoint}, each task reporting a snapshot that holds the state of its own
 * operator.
 */
@Test
public void testExternallyInducedSourceWithOperatorCoordinator() throws Exception {
    JobVertexID jobVertexID1 = new JobVertexID();
    JobVertexID jobVertexID2 = new JobVertexID();
    CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway gateway = new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1).addJobVertex(jobVertexID2).setTaskManagerGateway(gateway).build();
    ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
    ExecutionVertex vertex2 = graph.getJobVertex(jobVertexID2).getTaskVertices()[0];
    ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();
    ExecutionAttemptID attemptID2 = vertex2.getCurrentExecutionAttempt().getAttemptId();
    OperatorID opID1 = vertex1.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
    OperatorID opID2 = vertex2.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
    TaskStateSnapshot taskOperatorSubtaskStates1 = new TaskStateSnapshot();
    TaskStateSnapshot taskOperatorSubtaskStates2 = new TaskStateSnapshot();
    OperatorSubtaskState subtaskState1 = OperatorSubtaskState.builder().build();
    OperatorSubtaskState subtaskState2 = OperatorSubtaskState.builder().build();
    // Each task's snapshot carries the state of that task's own operator. The second state
    // previously went into the first snapshot, leaving the snapshot acknowledged by task 2 empty.
    taskOperatorSubtaskStates1.putSubtaskStateByOperatorID(opID1, subtaskState1);
    taskOperatorSubtaskStates2.putSubtaskStateByOperatorID(opID2, subtaskState2);
    // Create a mock OperatorCoordinatorCheckpointContext which completes the checkpoint
    // immediately.
    AtomicBoolean coordCheckpointDone = new AtomicBoolean(false);
    OperatorCoordinatorCheckpointContext coordinatorCheckpointContext = new CheckpointCoordinatorTestingUtils.MockOperatorCheckpointCoordinatorContextBuilder().setOnCallingCheckpointCoordinator((checkpointId, result) -> {
        coordCheckpointDone.set(true);
        result.complete(new byte[0]);
    }).setOperatorID(opID1).build();
    // set up the coordinator and validate the initial state
    CheckpointCoordinator checkpointCoordinator = new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCheckpointCoordinatorConfiguration(CheckpointCoordinatorConfiguration.builder().setMaxConcurrentCheckpoints(Integer.MAX_VALUE).build()).setTimer(manuallyTriggeredScheduledExecutor).setCoordinatorsToCheckpoint(Collections.singleton(coordinatorCheckpointContext)).build();
    // Receives the checkpoint id seen by the master hook so it can be verified afterwards.
    AtomicReference<Long> checkpointIdRef = new AtomicReference<>();
    // Add a master hook which triggers and acks the task checkpoint immediately.
    // In this case the task checkpoints would complete before the job master checkpoint
    // completes.
    checkpointCoordinator.addMasterHook(new MasterTriggerRestoreHook<Integer>() {

        @Override
        public String getIdentifier() {
            return "anything";
        }

        @Override
        @Nullable
        public CompletableFuture<Integer> triggerCheckpoint(long checkpointId, long timestamp, Executor executor) throws Exception {
            assertTrue("The coordinator checkpoint should have finished.", coordCheckpointDone.get());
            // Acknowledge the checkpoint in the master hooks so the task snapshots
            // complete before the master state snapshot completes.
            checkpointIdRef.set(checkpointId);
            AcknowledgeCheckpoint acknowledgeCheckpoint1 = new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStates1);
            AcknowledgeCheckpoint acknowledgeCheckpoint2 = new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStates2);
            checkpointCoordinator.receiveAcknowledgeMessage(acknowledgeCheckpoint1, TASK_MANAGER_LOCATION_INFO);
            checkpointCoordinator.receiveAcknowledgeMessage(acknowledgeCheckpoint2, TASK_MANAGER_LOCATION_INFO);
            // The hook itself contributes no state.
            return null;
        }

        @Override
        public void restoreCheckpoint(long checkpointId, Integer checkpointData) throws Exception {
            // Intentionally empty: this test does not restore anything through the hook.
        }

        @Override
        public SimpleVersionedSerializer<Integer> createCheckpointDataSerializer() {
            // Trivial serializer: writes no bytes and always reads back 1.
            return new SimpleVersionedSerializer<Integer>() {

                @Override
                public int getVersion() {
                    return 0;
                }

                @Override
                public byte[] serialize(Integer obj) throws IOException {
                    return new byte[0];
                }

                @Override
                public Integer deserialize(int version, byte[] serialized) throws IOException {
                    return 1;
                }
            };
        }
    });
    // Verify initial state.
    assertEquals(0, checkpointCoordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
    assertEquals(0, manuallyTriggeredScheduledExecutor.getActiveScheduledTasks().size());
    // trigger the first checkpoint. this should succeed
    final CompletableFuture<CompletedCheckpoint> checkpointFuture = checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(checkpointFuture);
    // now we should have a completed checkpoint
    assertEquals(1, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
    assertEquals(0, checkpointCoordinator.getNumberOfPendingCheckpoints());
    // the canceler should be removed now
    assertEquals(0, manuallyTriggeredScheduledExecutor.getActiveScheduledTasks().size());
    // validate that the checkpoint recorded as triggered for each task is the one the master
    // hook saw
    long checkpointId = checkpointIdRef.get();
    for (ExecutionVertex vertex : Arrays.asList(vertex1, vertex2)) {
        ExecutionAttemptID attemptId = vertex.getCurrentExecutionAttempt().getAttemptId();
        assertEquals(checkpointId, gateway.getOnlyTriggeredCheckpoint(attemptId).checkpointId);
    }
    CompletedCheckpoint success = checkpointCoordinator.getSuccessfulCheckpoints().get(0);
    assertEquals(graph.getJobID(), success.getJobId());
    assertEquals(2, success.getOperatorStates().size());
    checkpointCoordinator.shutdown();
}
Also used : JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CompletableFuture(java.util.concurrent.CompletableFuture) ManuallyTriggeredScheduledExecutor(org.apache.flink.util.concurrent.ManuallyTriggeredScheduledExecutor) ScheduledExecutor(org.apache.flink.util.concurrent.ScheduledExecutor) Executor(java.util.concurrent.Executor) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) SimpleVersionedSerializer(org.apache.flink.core.io.SimpleVersionedSerializer) AtomicReference(java.util.concurrent.atomic.AtomicReference) IOException(java.io.IOException) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) TriFunctionWithException(org.apache.flink.util.function.TriFunctionWithException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) RpcException(org.apache.flink.runtime.rpc.exceptions.RpcException) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) AtomicLong(java.util.concurrent.atomic.AtomicLong) ArgumentMatchers.anyLong(org.mockito.ArgumentMatchers.anyLong) Nullable(javax.annotation.Nullable) Test(org.junit.Test)

Aggregations

ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)233 Test (org.junit.Test)176 JobID (org.apache.flink.api.common.JobID)111 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)92 Configuration (org.apache.flink.configuration.Configuration)56 ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex)56 IOException (java.io.IOException)51 CompletableFuture (java.util.concurrent.CompletableFuture)43 ExecutionConfig (org.apache.flink.api.common.ExecutionConfig)38 TaskDeploymentDescriptor (org.apache.flink.runtime.deployment.TaskDeploymentDescriptor)38 OperatorID (org.apache.flink.runtime.jobgraph.OperatorID)36 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)35 AcknowledgeCheckpoint (org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint)35 ResourceID (org.apache.flink.runtime.clusterframework.types.ResourceID)34 ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph)34 ExecutionException (java.util.concurrent.ExecutionException)29 ArrayList (java.util.ArrayList)27 AllocationID (org.apache.flink.runtime.clusterframework.types.AllocationID)27 IntermediateDataSetID (org.apache.flink.runtime.jobgraph.IntermediateDataSetID)27 ExecutionState (org.apache.flink.runtime.execution.ExecutionState)26