Search in sources :

Example 1 with CheckpointCoordinatorConfigurationBuilder

use of org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder in project flink by apache.

the class CheckpointCoordinatorRestoringTest method testRestoreFinishedStateWithoutInFlightData.

@Test
public void testRestoreFinishedStateWithoutInFlightData() throws Exception {
    // given: Operator with not empty states.
    OperatorIDPair op1 = OperatorIDPair.generatedIDOnly(new OperatorID());
    final JobVertexID jobVertexID = new JobVertexID();
    ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID, 1, 1, singletonList(op1), true).build();
    CompletedCheckpointStore completedCheckpointStore = new EmbeddedCompletedCheckpointStore();
    Map<OperatorID, OperatorState> operatorStates = new HashMap<>();
    operatorStates.put(op1.getGeneratedOperatorID(), new FullyFinishedOperatorState(op1.getGeneratedOperatorID(), 1, 1));
    CompletedCheckpoint completedCheckpoint = new CompletedCheckpoint(graph.getJobID(), 2, System.currentTimeMillis(), System.currentTimeMillis() + 3000, operatorStates, Collections.emptyList(), CheckpointProperties.forCheckpoint(CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION), new TestCompletedCheckpointStorageLocation());
    completedCheckpointStore.addCheckpointAndSubsumeOldestOne(completedCheckpoint, new CheckpointsCleaner(), () -> {
    });
    CheckpointCoordinator coord = new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCheckpointCoordinatorConfiguration(new CheckpointCoordinatorConfigurationBuilder().setCheckpointIdOfIgnoredInFlightData(2).build()).setCompletedCheckpointStore(completedCheckpointStore).build();
    ExecutionJobVertex vertex = graph.getJobVertex(jobVertexID);
    coord.restoreInitialCheckpointIfPresent(Collections.singleton(vertex));
    TaskStateSnapshot restoredState = vertex.getTaskVertices()[0].getCurrentExecutionAttempt().getTaskRestore().getTaskStateSnapshot();
    assertTrue(restoredState.isTaskDeployedAsFinished());
}
Also used : HashMap(java.util.HashMap) CheckpointCoordinatorConfigurationBuilder(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) TestCompletedCheckpointStorageLocation(org.apache.flink.runtime.state.testutils.TestCompletedCheckpointStorageLocation) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) OperatorIDPair(org.apache.flink.runtime.OperatorIDPair) Test(org.junit.Test)

Example 2 with CheckpointCoordinatorConfigurationBuilder

use of org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder in project flink by apache.

the class CheckpointCoordinatorRestoringTest method testRestoreLatestCheckpointedStateWithoutInFlightData.

@Test
public void testRestoreLatestCheckpointedStateWithoutInFlightData() throws Exception {
    // given: Operator with not empty states.
    final JobVertexID jobVertexID = new JobVertexID();
    int parallelism1 = 3;
    int maxParallelism1 = 42;
    CompletedCheckpointStore completedCheckpointStore = new EmbeddedCompletedCheckpointStore();
    final ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID, parallelism1, maxParallelism1).build();
    final ExecutionJobVertex jobVertex = graph.getJobVertex(jobVertexID);
    // set up the coordinator and validate the initial state
    CheckpointCoordinator coord = new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCompletedCheckpointStore(completedCheckpointStore).setCheckpointCoordinatorConfiguration(new CheckpointCoordinatorConfigurationBuilder().setCheckpointIdOfIgnoredInFlightData(1).build()).setTimer(manuallyTriggeredScheduledExecutor).build();
    // trigger the checkpoint
    coord.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    assertEquals(1, coord.getPendingCheckpoints().size());
    long checkpointId = Iterables.getOnlyElement(coord.getPendingCheckpoints().keySet());
    List<KeyGroupRange> keyGroupPartitions1 = StateAssignmentOperation.createKeyGroupPartitions(maxParallelism1, parallelism1);
    Random random = new Random();
    // fill the states and complete the checkpoint.
    for (int index = 0; index < jobVertex.getParallelism(); index++) {
        OperatorSubtaskState operatorSubtaskState = OperatorSubtaskState.builder().setManagedOperatorState(generatePartitionableStateHandle(jobVertexID, index, 2, 8, false)).setRawOperatorState(generatePartitionableStateHandle(jobVertexID, index, 2, 8, true)).setManagedKeyedState(generateKeyGroupState(jobVertexID, keyGroupPartitions1.get(index), false)).setRawKeyedState(generateKeyGroupState(jobVertexID, keyGroupPartitions1.get(index), true)).setInputChannelState(StateObjectCollection.singleton(createNewInputChannelStateHandle(3, random))).setResultSubpartitionState(StateObjectCollection.singleton(createNewResultSubpartitionStateHandle(3, random))).build();
        TaskStateSnapshot taskOperatorSubtaskStates = new TaskStateSnapshot();
        taskOperatorSubtaskStates.putSubtaskStateByOperatorID(OperatorID.fromJobVertexID(jobVertexID), operatorSubtaskState);
        AcknowledgeCheckpoint acknowledgeCheckpoint = new AcknowledgeCheckpoint(graph.getJobID(), jobVertex.getTaskVertices()[index].getCurrentExecutionAttempt().getAttemptId(), checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStates);
        coord.receiveAcknowledgeMessage(acknowledgeCheckpoint, TASK_MANAGER_LOCATION_INFO);
    }
    assertEquals(1, coord.getSuccessfulCheckpoints().size());
    // when: Restore latest checkpoint without in-flight data.
    Set<ExecutionJobVertex> tasks = new HashSet<>();
    tasks.add(jobVertex);
    assertTrue(coord.restoreLatestCheckpointedStateToAll(tasks, false));
    // then: All states should be restored successfully except InputChannel and
    // ResultSubpartition which should be ignored.
    verifyStateRestore(jobVertexID, jobVertex, keyGroupPartitions1);
    for (int i = 0; i < jobVertex.getParallelism(); i++) {
        JobManagerTaskRestore taskRestore = jobVertex.getTaskVertices()[i].getCurrentExecutionAttempt().getTaskRestore();
        Assert.assertEquals(1L, taskRestore.getRestoreCheckpointId());
        TaskStateSnapshot stateSnapshot = taskRestore.getTaskStateSnapshot();
        OperatorSubtaskState operatorState = stateSnapshot.getSubtaskStateByOperatorID(OperatorID.fromJobVertexID(jobVertexID));
        assertTrue(operatorState.getInputChannelState().isEmpty());
        assertTrue(operatorState.getResultSubpartitionState().isEmpty());
        assertFalse(operatorState.getRawOperatorState().isEmpty());
        assertFalse(operatorState.getManagedOperatorState().isEmpty());
        assertFalse(operatorState.getRawKeyedState().isEmpty());
        assertFalse(operatorState.getManagedOperatorState().isEmpty());
    }
}
Also used : CheckpointCoordinatorConfigurationBuilder(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) KeyGroupRange(org.apache.flink.runtime.state.KeyGroupRange) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) Random(java.util.Random) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) HashSet(java.util.HashSet) Test(org.junit.Test)

Example 3 with CheckpointCoordinatorConfigurationBuilder

use of org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder in project flink by apache.

the class CheckpointCoordinatorTriggeringTest method testPeriodicTriggering.

@Test
public void testPeriodicTriggering() {
    try {
        final long start = System.currentTimeMillis();
        CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway gateway = new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
        JobVertexID jobVertexID = new JobVertexID();
        ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID).setTaskManagerGateway(gateway).build();
        ExecutionVertex vertex = graph.getJobVertex(jobVertexID).getTaskVertices()[0];
        ExecutionAttemptID attemptID = vertex.getCurrentExecutionAttempt().getAttemptId();
        CheckpointCoordinatorConfiguration checkpointCoordinatorConfiguration = new CheckpointCoordinatorConfigurationBuilder().setCheckpointInterval(// periodic interval is 10 ms
        10).setCheckpointTimeout(// timeout is very long (200 s)
        200000).setMaxConcurrentCheckpoints(Integer.MAX_VALUE).build();
        CheckpointCoordinator checkpointCoordinator = new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCheckpointCoordinatorConfiguration(checkpointCoordinatorConfiguration).setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2)).setTimer(manuallyTriggeredScheduledExecutor).build();
        checkpointCoordinator.startCheckpointScheduler();
        for (int i = 0; i < 5; ++i) {
            manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
            manuallyTriggeredScheduledExecutor.triggerAll();
        }
        checkRecordedTriggeredCheckpoints(5, start, gateway.getTriggeredCheckpoints(attemptID));
        checkpointCoordinator.stopCheckpointScheduler();
        // no further calls may come.
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();
        assertEquals(5, gateway.getTriggeredCheckpoints(attemptID).size());
        // start another sequence of periodic scheduling
        gateway.resetCount();
        checkpointCoordinator.startCheckpointScheduler();
        for (int i = 0; i < 5; ++i) {
            manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
            manuallyTriggeredScheduledExecutor.triggerAll();
        }
        checkRecordedTriggeredCheckpoints(5, start, gateway.getTriggeredCheckpoints(attemptID));
        checkpointCoordinator.stopCheckpointScheduler();
        // no further calls may come
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();
        assertEquals(5, gateway.getTriggeredCheckpoints(attemptID).size());
        checkpointCoordinator.shutdown();
    } catch (Exception e) {
        e.printStackTrace();
        fail(e.getMessage());
    }
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) CheckpointCoordinatorConfigurationBuilder(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionException(java.util.concurrent.ExecutionException) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Test(org.junit.Test)

Example 4 with CheckpointCoordinatorConfigurationBuilder

use of org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder in project flink by apache.

the class CheckpointCoordinatorTriggeringTest method testMinTimeBetweenCheckpointsInterval.

/**
 * This test verified that after a completed checkpoint a certain time has passed before another
 * is triggered.
 */
@Test
public void testMinTimeBetweenCheckpointsInterval() throws Exception {
    JobVertexID jobVertexID = new JobVertexID();
    CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway gateway = new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID).setTaskManagerGateway(gateway).build();
    ExecutionVertex vertex = graph.getJobVertex(jobVertexID).getTaskVertices()[0];
    ExecutionAttemptID attemptID = vertex.getCurrentExecutionAttempt().getAttemptId();
    final long delay = 50;
    final long checkpointInterval = 12;
    CheckpointCoordinatorConfiguration checkpointCoordinatorConfiguration = new CheckpointCoordinatorConfigurationBuilder().setCheckpointInterval(// periodic interval is 12 ms
    checkpointInterval).setCheckpointTimeout(// timeout is very long (200 s)
    200_000).setMinPauseBetweenCheckpoints(// 50 ms delay between checkpoints
    delay).setMaxConcurrentCheckpoints(1).build();
    final CheckpointCoordinator checkpointCoordinator = new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCheckpointCoordinatorConfiguration(checkpointCoordinatorConfiguration).setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2)).setTimer(manuallyTriggeredScheduledExecutor).build();
    try {
        checkpointCoordinator.startCheckpointScheduler();
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();
        // wait until the first checkpoint was triggered
        Long firstCallId = gateway.getTriggeredCheckpoints(attemptID).get(0).checkpointId;
        assertEquals(1L, firstCallId.longValue());
        AcknowledgeCheckpoint ackMsg = new AcknowledgeCheckpoint(graph.getJobID(), attemptID, 1L);
        // tell the coordinator that the checkpoint is done
        final long ackTime = System.nanoTime();
        checkpointCoordinator.receiveAcknowledgeMessage(ackMsg, TASK_MANAGER_LOCATION_INFO);
        gateway.resetCount();
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();
        while (gateway.getTriggeredCheckpoints(attemptID).isEmpty()) {
            // sleeps for a while to simulate periodic scheduling
            Thread.sleep(checkpointInterval);
            manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
            manuallyTriggeredScheduledExecutor.triggerAll();
        }
        // wait until the next checkpoint is triggered
        Long nextCallId = gateway.getTriggeredCheckpoints(attemptID).get(0).checkpointId;
        final long nextCheckpointTime = System.nanoTime();
        assertEquals(2L, nextCallId.longValue());
        final long delayMillis = (nextCheckpointTime - ackTime) / 1_000_000;
        // we need to add one ms here to account for rounding errors
        if (delayMillis + 1 < delay) {
            fail("checkpoint came too early: delay was " + delayMillis + " but should have been at least " + delay);
        }
    } finally {
        checkpointCoordinator.stopCheckpointScheduler();
        checkpointCoordinator.shutdown();
    }
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) CheckpointCoordinatorConfigurationBuilder(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Test(org.junit.Test)

Aggregations

CheckpointCoordinatorBuilder (org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder)4 ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph)4 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)4 CheckpointCoordinatorConfigurationBuilder (org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder)4 Test (org.junit.Test)4 AcknowledgeCheckpoint (org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint)3 ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)2 ExecutionJobVertex (org.apache.flink.runtime.executiongraph.ExecutionJobVertex)2 ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex)2 CheckpointCoordinatorConfiguration (org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration)2 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1 Random (java.util.Random)1 ExecutionException (java.util.concurrent.ExecutionException)1 OperatorIDPair (org.apache.flink.runtime.OperatorIDPair)1 OperatorID (org.apache.flink.runtime.jobgraph.OperatorID)1 KeyGroupRange (org.apache.flink.runtime.state.KeyGroupRange)1 TestCompletedCheckpointStorageLocation (org.apache.flink.runtime.state.testutils.TestCompletedCheckpointStorageLocation)1