Search in sources :

Example 16 with CheckpointCoordinatorBuilder

use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.

the class CheckpointCoordinatorTest method testMaxConcurrentAttempts.

/**
 * Drives the periodic checkpoint scheduler by hand and checks how many checkpoints get
 * triggered for a given concurrency limit.
 *
 * <p>The scenario: run the periodic task {@code maxConcurrentAttempts} times and expect exactly
 * that many trigger messages; acknowledge checkpoint 1 and expect one additional trigger on the
 * next round; run one more round and expect no further trigger.
 *
 * @param maxConcurrentAttempts value handed to {@code setMaxConcurrentCheckpoints}; also the
 *     number of scheduler rounds executed before the first assertion
 */
private void testMaxConcurrentAttempts(int maxConcurrentAttempts) {
    try {
        JobVertexID jobVertexID1 = new JobVertexID();
        CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway gateway =
                new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
        ExecutionGraph graph =
                new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                        .addJobVertex(jobVertexID1)
                        .setTaskManagerGateway(gateway)
                        .build();
        ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
        ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();

        CheckpointCoordinatorConfiguration chkConfig =
                new CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder()
                        // periodic interval is 10 ms
                        .setCheckpointInterval(10)
                        // timeout is very long (200 s)
                        .setCheckpointTimeout(200000)
                        // no extra delay
                        .setMinPauseBetweenCheckpoints(0L)
                        .setMaxConcurrentCheckpoints(maxConcurrentAttempts)
                        .build();
        CheckpointCoordinator checkpointCoordinator =
                new CheckpointCoordinatorBuilder()
                        .setExecutionGraph(graph)
                        .setCheckpointCoordinatorConfiguration(chkConfig)
                        .setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
                        .setTimer(manuallyTriggeredScheduledExecutor)
                        .build();
        checkpointCoordinator.startCheckpointScheduler();

        // one scheduler round per allowed concurrent checkpoint
        for (int i = 0; i < maxConcurrentAttempts; i++) {
            manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
            manuallyTriggeredScheduledExecutor.triggerAll();
        }
        assertEquals(maxConcurrentAttempts, gateway.getTriggeredCheckpoints(attemptID1).size());
        assertEquals(0, gateway.getNotifiedCompletedCheckpoints(attemptID1).size());

        // now, once we acknowledge one checkpoint, it should trigger the next one
        checkpointCoordinator.receiveAcknowledgeMessage(
                new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, 1L),
                TASK_MANAGER_LOCATION_INFO);
        final Collection<ScheduledFuture<?>> periodicScheduledTasks =
                manuallyTriggeredScheduledExecutor.getActivePeriodicScheduledTask();
        assertEquals(1, periodicScheduledTasks.size());
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();
        assertEquals(
                maxConcurrentAttempts + 1, gateway.getTriggeredCheckpoints(attemptID1).size());

        // no further checkpoints should happen
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();
        assertEquals(
                maxConcurrentAttempts + 1, gateway.getTriggeredCheckpoints(attemptID1).size());

        checkpointCoordinator.shutdown();
    } catch (Exception e) {
        // Fail the test with the original exception attached as the cause, so the test report
        // shows its full stack trace even when the exception carries no message.
        throw new AssertionError(
                "testMaxConcurrentAttempts failed with an unexpected exception: " + e, e);
    }
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) ScheduledFuture(java.util.concurrent.ScheduledFuture) TriFunctionWithException(org.apache.flink.util.function.TriFunctionWithException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) RpcException(org.apache.flink.runtime.rpc.exceptions.RpcException) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph)

Example 17 with CheckpointCoordinatorBuilder

use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.

the class CheckpointCoordinatorTest method testHandleMessagesForNonExistingCheckpoints.

/**
 * Sends acknowledge messages that do not match the single pending checkpoint (wrong job,
 * unknown checkpoint ID, unknown execution attempt) and expects the coordinator to accept each
 * of them without throwing.
 */
@Test
public void testHandleMessagesForNonExistingCheckpoints() throws Exception {
    // build a two-vertex execution graph and trigger one checkpoint
    JobVertexID jobVertexID1 = new JobVertexID();
    JobVertexID jobVertexID2 = new JobVertexID();
    CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway gateway =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(jobVertexID1)
                    .addJobVertex(jobVertexID2, false)
                    .setTaskManagerGateway(gateway)
                    .build();
    ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
    ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();
    CheckpointCoordinator checkpointCoordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();

    final CompletableFuture<CompletedCheckpoint> checkpointFuture =
            checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(checkpointFuture);

    long checkpointId = checkpointCoordinator.getPendingCheckpoints().keySet().iterator().next();
    // Derived from the pending ID instead of being hard-coded: a literal such as 1L can
    // coincide with the ID the coordinator actually assigned, in which case the "unknown
    // checkpoint" message below would really address the pending checkpoint.
    final long unknownCheckpointId = checkpointId + 1;

    // Send some messages that do not belong to the job, to the pending checkpoint, or to any
    // of the vertices that need to acknowledge it.
    // None of the messages should throw an exception.

    // wrong job id
    checkpointCoordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(new JobID(), attemptID1, checkpointId),
            TASK_MANAGER_LOCATION_INFO);
    // unknown checkpoint
    checkpointCoordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, unknownCheckpointId),
            TASK_MANAGER_LOCATION_INFO);
    // unknown ack vertex
    checkpointCoordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(graph.getJobID(), new ExecutionAttemptID(), checkpointId),
            TASK_MANAGER_LOCATION_INFO);

    checkpointCoordinator.shutdown();
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) JobID(org.apache.flink.api.common.JobID) Test(org.junit.Test)

Example 18 with CheckpointCoordinatorBuilder

use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.

the class CheckpointCoordinatorTest method testMultipleConcurrentCheckpoints.

/**
 * Triggers two checkpoints that are pending at the same time, acknowledges them in an
 * interleaved order, and checks after every step the number of pending and retained
 * checkpoints, the messages recorded by the gateway, and finally the contents of the two
 * completed checkpoints.
 */
@Test
public void testMultipleConcurrentCheckpoints() throws Exception {
    final JobVertexID firstVertexId = new JobVertexID();
    final JobVertexID secondVertexId = new JobVertexID();
    final JobVertexID thirdVertexId = new JobVertexID();

    final CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway recorder =
            new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    // NOTE(review): the 'false' flag on the third vertex presumably excludes it from
    // triggering (only vertices 1 and 2 are checked for trigger messages below) - confirm.
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(firstVertexId)
                    .addJobVertex(secondVertexId)
                    .addJobVertex(thirdVertexId, false)
                    .setTaskManagerGateway(recorder)
                    .build();

    final ExecutionVertex vertex1 = executionGraph.getJobVertex(firstVertexId).getTaskVertices()[0];
    final ExecutionVertex vertex2 = executionGraph.getJobVertex(secondVertexId).getTaskVertices()[0];
    final ExecutionVertex vertex3 = executionGraph.getJobVertex(thirdVertexId).getTaskVertices()[0];
    final ExecutionAttemptID attempt1 = vertex1.getCurrentExecutionAttempt().getAttemptId();
    final ExecutionAttemptID attempt2 = vertex2.getCurrentExecutionAttempt().getAttemptId();
    final ExecutionAttemptID attempt3 = vertex3.getCurrentExecutionAttempt().getAttemptId();

    // vertex groups inspected by the assertions further down
    final List<ExecutionVertex> verticesExpectingTrigger = Arrays.asList(vertex1, vertex2);
    final List<ExecutionVertex> verticesExpectingNotification =
            Arrays.asList(vertex1, vertex2, vertex3);

    // coordinator without a practical limit on concurrent checkpoints; starts out empty
    final CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setCheckpointCoordinatorConfiguration(
                            CheckpointCoordinatorConfiguration.builder()
                                    .setMaxConcurrentCheckpoints(Integer.MAX_VALUE)
                                    .build())
                    .setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();
    assertEquals(0, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, coordinator.getNumberOfRetainedSuccessfulCheckpoints());

    // ---- first checkpoint: trigger, must become pending ----
    final CompletableFuture<CompletedCheckpoint> firstTrigger =
            coordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(firstTrigger);
    assertEquals(1, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, coordinator.getNumberOfRetainedSuccessfulCheckpoints());

    final PendingCheckpoint pending1 =
            coordinator.getPendingCheckpoints().values().iterator().next();
    final long checkpointId1 = pending1.getCheckpointId();

    // each of vertices 1 and 2 must have recorded exactly this trigger
    for (ExecutionVertex triggered : verticesExpectingTrigger) {
        assertEquals(
                checkpointId1,
                recorder.getOnlyTriggeredCheckpoint(
                                triggered.getCurrentExecutionAttempt().getAttemptId())
                        .checkpointId);
    }

    // first checkpoint: acknowledgement from vertex 2 only
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(executionGraph.getJobID(), attempt2, checkpointId1),
            TASK_MANAGER_LOCATION_INFO);

    // ---- second checkpoint: trigger while the first one is still pending ----
    recorder.resetCount();
    final CompletableFuture<CompletedCheckpoint> secondTrigger =
            coordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(secondTrigger);
    assertEquals(2, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, coordinator.getNumberOfRetainedSuccessfulCheckpoints());

    // of the two pending checkpoints, the second is the one that is not pending1
    final Iterator<PendingCheckpoint> pendingIterator =
            coordinator.getPendingCheckpoints().values().iterator();
    final PendingCheckpoint firstListed = pendingIterator.next();
    final PendingCheckpoint secondListed = pendingIterator.next();
    final PendingCheckpoint pending2;
    if (firstListed == pending1) {
        pending2 = secondListed;
    } else {
        pending2 = firstListed;
    }
    final long checkpointId2 = pending2.getCheckpointId();

    // after the reset, vertices 1 and 2 must each have recorded only the second trigger
    for (ExecutionVertex triggered : verticesExpectingTrigger) {
        assertEquals(
                checkpointId2,
                recorder.getOnlyTriggeredCheckpoint(
                                triggered.getCurrentExecutionAttempt().getAttemptId())
                        .checkpointId);
    }

    // Interleaved acknowledgements: the two outstanding ones for the first checkpoint
    // (vertices 3 and 1) and two of three for the second checkpoint (vertices 1 and 2).
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(executionGraph.getJobID(), attempt3, checkpointId1),
            TASK_MANAGER_LOCATION_INFO);
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(executionGraph.getJobID(), attempt1, checkpointId2),
            TASK_MANAGER_LOCATION_INFO);
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(executionGraph.getJobID(), attempt1, checkpointId1),
            TASK_MANAGER_LOCATION_INFO);
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(executionGraph.getJobID(), attempt2, checkpointId2),
            TASK_MANAGER_LOCATION_INFO);

    // the first checkpoint is now complete, the second still pending
    assertEquals(1, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(1, coordinator.getNumberOfRetainedSuccessfulCheckpoints());
    assertTrue(pending1.isDisposed());

    // all three vertices must have been notified about the first checkpoint
    for (ExecutionVertex notified : verticesExpectingNotification) {
        assertEquals(
                checkpointId1,
                recorder.getOnlyNotifiedCompletedCheckpoint(
                                notified.getCurrentExecutionAttempt().getAttemptId())
                        .checkpointId);
    }

    // second checkpoint: the one missing acknowledgement (vertex 3)
    recorder.resetCount();
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(executionGraph.getJobID(), attempt3, checkpointId2),
            TASK_MANAGER_LOCATION_INFO);

    // nothing pending any more, both checkpoints retained
    assertEquals(0, coordinator.getNumberOfPendingCheckpoints());
    assertEquals(2, coordinator.getNumberOfRetainedSuccessfulCheckpoints());
    assertTrue(pending2.isDisposed());

    // all three vertices must have been notified about the second checkpoint
    for (ExecutionVertex notified : verticesExpectingNotification) {
        assertEquals(
                checkpointId2,
                recorder.getOnlyNotifiedCompletedCheckpoint(
                                notified.getCurrentExecutionAttempt().getAttemptId())
                        .checkpointId);
    }

    // inspect the two completed checkpoints in store order
    final List<CompletedCheckpoint> completed = coordinator.getSuccessfulCheckpoints();

    final CompletedCheckpoint completedFirst = completed.get(0);
    assertEquals(checkpointId1, completedFirst.getCheckpointID());
    assertEquals(executionGraph.getJobID(), completedFirst.getJobId());
    assertEquals(3, completedFirst.getOperatorStates().size());
    assertTrue(
            completedFirst.getOperatorStates().values().stream()
                    .allMatch(this::hasNoSubState));

    final CompletedCheckpoint completedSecond = completed.get(1);
    assertEquals(checkpointId2, completedSecond.getCheckpointID());
    assertEquals(executionGraph.getJobID(), completedSecond.getJobId());
    assertEquals(3, completedSecond.getOperatorStates().size());
    assertTrue(
            completedSecond.getOperatorStates().values().stream()
                    .allMatch(this::hasNoSubState));

    coordinator.shutdown();
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Iterator(java.util.Iterator) Test(org.junit.Test)

Example 19 with CheckpointCoordinatorBuilder

use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.

the class CheckpointCoordinatorTest method testSharedStateNotDiscaredOnAbort.

/**
 * Checks what happens to the state handles of an aborted checkpoint: after one vertex
 * acknowledges checkpoint 1 and the other declines it, the private and metadata handles must be
 * disposed while the shared handle must not be; once checkpoint 2 has been acknowledged by
 * both vertices and its future has completed, the shared handle must be disposed as well.
 */
@Test
public void testSharedStateNotDiscaredOnAbort() throws Exception {
    final JobVertexID ackingVertexId = new JobVertexID();
    final JobVertexID decliningVertexId = new JobVertexID();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(ackingVertexId)
                    .addJobVertex(decliningVertexId)
                    .build();
    final CheckpointCoordinator checkpointCoordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();
    checkpointCoordinator.startCheckpointScheduler();

    // checkpoint 1: acknowledged by one vertex, declined by the other
    CompletableFuture<CompletedCheckpoint> pendingResult =
            checkpointCoordinator.triggerCheckpoint(true);
    manuallyTriggeredScheduledExecutor.triggerAll();
    pendingResult.getNow(null);

    final TestingStreamStateHandle metadataHandle = handle();
    final TestingStreamStateHandle privateHandle = handle();
    final TestingStreamStateHandle sharedHandle = handle();
    ackCheckpoint(
            1L,
            checkpointCoordinator,
            ackingVertexId,
            executionGraph,
            metadataHandle,
            privateHandle,
            sharedHandle);
    declineCheckpoint(1L, checkpointCoordinator, decliningVertexId, executionGraph);

    assertTrue(privateHandle.isDisposed());
    assertTrue(metadataHandle.isDisposed());
    assertFalse(sharedHandle.isDisposed());

    // checkpoint 2: acknowledged by both vertices with fresh handles
    pendingResult = checkpointCoordinator.triggerCheckpoint(true);
    manuallyTriggeredScheduledExecutor.triggerAll();
    pendingResult.getNow(null);

    ackCheckpoint(
            2L, checkpointCoordinator, ackingVertexId, executionGraph, handle(), handle(), handle());
    ackCheckpoint(
            2L,
            checkpointCoordinator,
            decliningVertexId,
            executionGraph,
            handle(),
            handle(),
            handle());

    // wait for checkpoint 2 before looking at the shared handle of checkpoint 1 again
    pendingResult.get();
    assertTrue(sharedHandle.isDisposed());
}
Also used : TestingStreamStateHandle(org.apache.flink.runtime.state.TestingStreamStateHandle) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) Test(org.junit.Test)

Example 20 with CheckpointCoordinatorBuilder

use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.

the class CheckpointCoordinatorTest method testReportStatsAfterFailure.

/**
 * Triggers a checkpoint, has one vertex decline it, then lets the other vertex report metrics
 * through {@code reportFn} and asserts that the stats tracker's history entry for the
 * checkpoint matches those metrics.
 *
 * @param checkpointId ID used both in the decline message and for the stats lookup
 * @param reportFn callback that delivers the late metrics; invoked with the coordinator, the
 *     current execution attempt of the late-reporting vertex, and the metrics
 * @throws Exception if the trigger future completed exceptionally, or if {@code reportFn} or a
 *     coordinator call throws
 */
private void testReportStatsAfterFailure(long checkpointId, TriFunctionWithException<CheckpointCoordinator, Execution, CheckpointMetrics, ?, CheckpointException> reportFn) throws Exception {
    final JobVertexID decliningId = new JobVertexID();
    final JobVertexID lateReportId = new JobVertexID();

    final ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(decliningId)
                    .addJobVertex(lateReportId)
                    .build();
    final ExecutionVertex decliner = graph.getJobVertex(decliningId).getTaskVertices()[0];
    final ExecutionVertex lateReporter = graph.getJobVertex(lateReportId).getTaskVertices()[0];

    final CheckpointStatsTracker tracker =
            new CheckpointStatsTracker(Integer.MAX_VALUE, new UnregisteredMetricsGroup());
    final CheckpointCoordinator checkpointCoordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .setCheckpointStatsTracker(tracker)
                    .build();

    final CompletableFuture<CompletedCheckpoint> triggerResult =
            checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    checkState(
            checkpointCoordinator.getNumberOfPendingCheckpoints() == 1,
            "wrong number of pending checkpoints: %s",
            checkpointCoordinator.getNumberOfPendingCheckpoints());
    // surface a trigger failure right away if the future has already completed
    if (triggerResult.isDone()) {
        triggerResult.get();
    }

    // one vertex declines the checkpoint ...
    checkpointCoordinator.receiveDeclineMessage(
            new DeclineCheckpoint(
                    graph.getJobID(),
                    decliner.getCurrentExecutionAttempt().getAttemptId(),
                    checkpointId,
                    new CheckpointException(CHECKPOINT_DECLINED)),
            "test");

    // ... and the other one reports its metrics afterwards
    final CheckpointMetrics lateMetrics =
            new CheckpointMetricsBuilder()
                    .setTotalBytesPersisted(18)
                    .setBytesPersistedOfThisCheckpoint(18)
                    .setBytesProcessedDuringAlignment(19)
                    .setAsyncDurationMillis(20)
                    .setAlignmentDurationNanos(123 * 1_000_000)
                    .setCheckpointStartDelayNanos(567 * 1_000_000)
                    .build();
    reportFn.apply(checkpointCoordinator, lateReporter.getCurrentExecutionAttempt(), lateMetrics);

    assertStatsEqual(
            checkpointId,
            lateReporter.getJobvertexId(),
            0,
            lateMetrics,
            tracker.createSnapshot().getHistory().getCheckpointById(checkpointId));
}
Also used : DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) UnregisteredMetricsGroup(org.apache.flink.metrics.groups.UnregisteredMetricsGroup) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph)

Aggregations

CheckpointCoordinatorBuilder (org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder): 46; ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph): 41; JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID): 40; Test (org.junit.Test): 37; AcknowledgeCheckpoint (org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint): 30; ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex): 27; ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID): 22; ExecutionJobVertex (org.apache.flink.runtime.executiongraph.ExecutionJobVertex): 15; OperatorID (org.apache.flink.runtime.jobgraph.OperatorID): 13; DeclineCheckpoint (org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint): 13; HashSet (java.util.HashSet): 12; CheckpointCoordinatorConfiguration (org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration): 12; ManuallyTriggeredScheduledExecutor (org.apache.flink.util.concurrent.ManuallyTriggeredScheduledExecutor): 10; CompletableFuture (java.util.concurrent.CompletableFuture): 9; KeyGroupRange (org.apache.flink.runtime.state.KeyGroupRange): 9; HashMap (java.util.HashMap): 8; CheckpointCoordinatorConfigurationBuilder (org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder): 8; List (java.util.List): 7; ExecutionException (java.util.concurrent.ExecutionException): 7; JobID (org.apache.flink.api.common.JobID): 7