Search in sources :

Example 6 with OperatorID

use of org.apache.flink.runtime.jobgraph.OperatorID in project flink by apache.

the class CheckpointCoordinatorRestoringTest method generateIDPair.

private static Tuple2<JobVertexID, OperatorID> generateIDPair() {
    JobVertexID jobVertexID = new JobVertexID();
    OperatorID operatorID = OperatorID.fromJobVertexID(jobVertexID);
    return new Tuple2<>(jobVertexID, operatorID);
}
Also used : Tuple2(org.apache.flink.api.java.tuple.Tuple2) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID)

Example 7 with OperatorID

use of org.apache.flink.runtime.jobgraph.OperatorID in project flink by apache.

the class CheckpointCoordinatorTest method testTriggerAndConfirmSimpleSavepoint.

@Test
public void testTriggerAndConfirmSimpleSavepoint() throws Exception {
    JobVertexID jobVertexID1 = new JobVertexID();
    JobVertexID jobVertexID2 = new JobVertexID();
    CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway gateway = new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1).addJobVertex(jobVertexID2).setTaskManagerGateway(gateway).build();
    ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
    ExecutionVertex vertex2 = graph.getJobVertex(jobVertexID2).getTaskVertices()[0];
    ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();
    ExecutionAttemptID attemptID2 = vertex2.getCurrentExecutionAttempt().getAttemptId();
    // set up the coordinator and validate the initial state
    CheckpointCoordinator checkpointCoordinator = getCheckpointCoordinator(graph);
    assertEquals(0, checkpointCoordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
    // trigger the first checkpoint. this should succeed
    String savepointDir = tmpFolder.newFolder().getAbsolutePath();
    CompletableFuture<CompletedCheckpoint> savepointFuture = checkpointCoordinator.triggerSavepoint(savepointDir, SavepointFormatType.CANONICAL);
    manuallyTriggeredScheduledExecutor.triggerAll();
    assertFalse(savepointFuture.isDone());
    // validate that we have a pending savepoint
    assertEquals(1, checkpointCoordinator.getNumberOfPendingCheckpoints());
    long checkpointId = checkpointCoordinator.getPendingCheckpoints().entrySet().iterator().next().getKey();
    PendingCheckpoint pending = checkpointCoordinator.getPendingCheckpoints().get(checkpointId);
    assertNotNull(pending);
    assertEquals(checkpointId, pending.getCheckpointId());
    assertEquals(graph.getJobID(), pending.getJobId());
    assertEquals(2, pending.getNumberOfNonAcknowledgedTasks());
    assertEquals(0, pending.getNumberOfAcknowledgedTasks());
    assertEquals(0, pending.getOperatorStates().size());
    assertFalse(pending.isDisposed());
    assertFalse(pending.areTasksFullyAcknowledged());
    assertFalse(pending.canBeSubsumed());
    OperatorID opID1 = OperatorID.fromJobVertexID(vertex1.getJobvertexId());
    OperatorID opID2 = OperatorID.fromJobVertexID(vertex2.getJobvertexId());
    OperatorSubtaskState subtaskState1 = mock(OperatorSubtaskState.class);
    OperatorSubtaskState subtaskState2 = mock(OperatorSubtaskState.class);
    TaskStateSnapshot taskOperatorSubtaskStates1 = new TaskStateSnapshot(singletonMap(opID1, subtaskState1));
    TaskStateSnapshot taskOperatorSubtaskStates2 = new TaskStateSnapshot(singletonMap(opID2, subtaskState2));
    // acknowledge from one of the tasks
    AcknowledgeCheckpoint acknowledgeCheckpoint2 = new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStates2);
    checkpointCoordinator.receiveAcknowledgeMessage(acknowledgeCheckpoint2, TASK_MANAGER_LOCATION_INFO);
    assertEquals(1, pending.getNumberOfAcknowledgedTasks());
    assertEquals(1, pending.getNumberOfNonAcknowledgedTasks());
    assertFalse(pending.isDisposed());
    assertFalse(pending.areTasksFullyAcknowledged());
    assertFalse(savepointFuture.isDone());
    // acknowledge the same task again (should not matter)
    checkpointCoordinator.receiveAcknowledgeMessage(acknowledgeCheckpoint2, TASK_MANAGER_LOCATION_INFO);
    assertFalse(pending.isDisposed());
    assertFalse(pending.areTasksFullyAcknowledged());
    assertFalse(savepointFuture.isDone());
    // acknowledge the other task.
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStates1), TASK_MANAGER_LOCATION_INFO);
    // the checkpoint is internally converted to a successful checkpoint and the
    // pending checkpoint object is disposed
    assertTrue(pending.isDisposed());
    assertNotNull(savepointFuture.get());
    // the now we should have a completed checkpoint
    // savepoints should not registered as retained checkpoints
    assertEquals(0, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
    assertEquals(0, checkpointCoordinator.getNumberOfPendingCheckpoints());
    // validate that the relevant tasks got a confirmation message
    for (ExecutionVertex vertex : Arrays.asList(vertex1, vertex2)) {
        ExecutionAttemptID attemptId = vertex.getCurrentExecutionAttempt().getAttemptId();
        assertEquals(checkpointId, gateway.getOnlyTriggeredCheckpoint(attemptId).checkpointId);
        assertThat(gateway.getNotifiedCompletedCheckpoints(attemptId)).isEmpty();
    }
    CompletedCheckpoint success = savepointFuture.get();
    assertEquals(graph.getJobID(), success.getJobId());
    assertEquals(pending.getCheckpointId(), success.getCheckpointID());
    assertEquals(2, success.getOperatorStates().size());
    checkpointCoordinator.shutdown();
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Test(org.junit.Test)

Example 8 with OperatorID

use of org.apache.flink.runtime.jobgraph.OperatorID in project flink by apache.

the class CheckpointCoordinatorTest method testExternallyInducedSourceWithOperatorCoordinator.

/**
 * Test that the checkpoint still behave correctly when the task checkpoint is triggered by the
 * master hooks and finished before the master checkpoint. Also make sure that the operator
 * coordinators are checkpointed before starting the task checkpoint.
 */
@Test
public void testExternallyInducedSourceWithOperatorCoordinator() throws Exception {
    JobVertexID jobVertexID1 = new JobVertexID();
    JobVertexID jobVertexID2 = new JobVertexID();
    CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway gateway = new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1).addJobVertex(jobVertexID2).setTaskManagerGateway(gateway).build();
    ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
    ExecutionVertex vertex2 = graph.getJobVertex(jobVertexID2).getTaskVertices()[0];
    ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();
    ExecutionAttemptID attemptID2 = vertex2.getCurrentExecutionAttempt().getAttemptId();
    OperatorID opID1 = vertex1.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
    OperatorID opID2 = vertex2.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
    TaskStateSnapshot taskOperatorSubtaskStates1 = new TaskStateSnapshot();
    TaskStateSnapshot taskOperatorSubtaskStates2 = new TaskStateSnapshot();
    OperatorSubtaskState subtaskState1 = OperatorSubtaskState.builder().build();
    OperatorSubtaskState subtaskState2 = OperatorSubtaskState.builder().build();
    taskOperatorSubtaskStates1.putSubtaskStateByOperatorID(opID1, subtaskState1);
    taskOperatorSubtaskStates1.putSubtaskStateByOperatorID(opID2, subtaskState2);
    // Create a mock OperatorCoordinatorCheckpointContext which completes the checkpoint
    // immediately.
    AtomicBoolean coordCheckpointDone = new AtomicBoolean(false);
    OperatorCoordinatorCheckpointContext coordinatorCheckpointContext = new CheckpointCoordinatorTestingUtils.MockOperatorCheckpointCoordinatorContextBuilder().setOnCallingCheckpointCoordinator((checkpointId, result) -> {
        coordCheckpointDone.set(true);
        result.complete(new byte[0]);
    }).setOperatorID(opID1).build();
    // set up the coordinator and validate the initial state
    CheckpointCoordinator checkpointCoordinator = new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCheckpointCoordinatorConfiguration(CheckpointCoordinatorConfiguration.builder().setMaxConcurrentCheckpoints(Integer.MAX_VALUE).build()).setTimer(manuallyTriggeredScheduledExecutor).setCoordinatorsToCheckpoint(Collections.singleton(coordinatorCheckpointContext)).build();
    AtomicReference<Long> checkpointIdRef = new AtomicReference<>();
    // Add a master hook which triggers and acks the task checkpoint immediately.
    // In this case the task checkpoints would complete before the job master checkpoint
    // completes.
    checkpointCoordinator.addMasterHook(new MasterTriggerRestoreHook<Integer>() {

        @Override
        public String getIdentifier() {
            return "anything";
        }

        @Override
        @Nullable
        public CompletableFuture<Integer> triggerCheckpoint(long checkpointId, long timestamp, Executor executor) throws Exception {
            assertTrue("The coordinator checkpoint should have finished.", coordCheckpointDone.get());
            // Acknowledge the checkpoint in the master hooks so the task snapshots
            // complete before
            // the master state snapshot completes.
            checkpointIdRef.set(checkpointId);
            AcknowledgeCheckpoint acknowledgeCheckpoint1 = new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStates1);
            AcknowledgeCheckpoint acknowledgeCheckpoint2 = new AcknowledgeCheckpoint(graph.getJobID(), attemptID2, checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStates2);
            checkpointCoordinator.receiveAcknowledgeMessage(acknowledgeCheckpoint1, TASK_MANAGER_LOCATION_INFO);
            checkpointCoordinator.receiveAcknowledgeMessage(acknowledgeCheckpoint2, TASK_MANAGER_LOCATION_INFO);
            return null;
        }

        @Override
        public void restoreCheckpoint(long checkpointId, Integer checkpointData) throws Exception {
        }

        @Override
        public SimpleVersionedSerializer<Integer> createCheckpointDataSerializer() {
            return new SimpleVersionedSerializer<Integer>() {

                @Override
                public int getVersion() {
                    return 0;
                }

                @Override
                public byte[] serialize(Integer obj) throws IOException {
                    return new byte[0];
                }

                @Override
                public Integer deserialize(int version, byte[] serialized) throws IOException {
                    return 1;
                }
            };
        }
    });
    // Verify initial state.
    assertEquals(0, checkpointCoordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
    assertEquals(0, manuallyTriggeredScheduledExecutor.getActiveScheduledTasks().size());
    // trigger the first checkpoint. this should succeed
    final CompletableFuture<CompletedCheckpoint> checkpointFuture = checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(checkpointFuture);
    // now we should have a completed checkpoint
    assertEquals(1, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
    assertEquals(0, checkpointCoordinator.getNumberOfPendingCheckpoints());
    // the canceler should be removed now
    assertEquals(0, manuallyTriggeredScheduledExecutor.getActiveScheduledTasks().size());
    // validate that the relevant tasks got a confirmation message
    long checkpointId = checkpointIdRef.get();
    for (ExecutionVertex vertex : Arrays.asList(vertex1, vertex2)) {
        ExecutionAttemptID attemptId = vertex.getCurrentExecutionAttempt().getAttemptId();
        assertEquals(checkpointId, gateway.getOnlyTriggeredCheckpoint(attemptId).checkpointId);
    }
    CompletedCheckpoint success = checkpointCoordinator.getSuccessfulCheckpoints().get(0);
    assertEquals(graph.getJobID(), success.getJobId());
    assertEquals(2, success.getOperatorStates().size());
    checkpointCoordinator.shutdown();
}
Also used : JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CompletableFuture(java.util.concurrent.CompletableFuture) ManuallyTriggeredScheduledExecutor(org.apache.flink.util.concurrent.ManuallyTriggeredScheduledExecutor) ScheduledExecutor(org.apache.flink.util.concurrent.ScheduledExecutor) Executor(java.util.concurrent.Executor) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) SimpleVersionedSerializer(org.apache.flink.core.io.SimpleVersionedSerializer) AtomicReference(java.util.concurrent.atomic.AtomicReference) IOException(java.io.IOException) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) TriFunctionWithException(org.apache.flink.util.function.TriFunctionWithException) IOException(java.io.IOException) ExecutionException(java.util.concurrent.ExecutionException) RpcException(org.apache.flink.runtime.rpc.exceptions.RpcException) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) AtomicLong(java.util.concurrent.atomic.AtomicLong) ArgumentMatchers.anyLong(org.mockito.ArgumentMatchers.anyLong) Nullable(javax.annotation.Nullable) Test(org.junit.Test)

Example 9 with OperatorID

use of org.apache.flink.runtime.jobgraph.OperatorID in project flink by apache.

the class CheckpointCoordinatorTest method testCheckpointTimeoutIsolated.

@Test
public void testCheckpointTimeoutIsolated() throws Exception {
    JobVertexID jobVertexID1 = new JobVertexID();
    JobVertexID jobVertexID2 = new JobVertexID();
    CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway gateway = new CheckpointCoordinatorTestingUtils.CheckpointRecorderTaskManagerGateway();
    ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1).addJobVertex(jobVertexID2, false).setTaskManagerGateway(gateway).build();
    ExecutionVertex vertex1 = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
    ExecutionVertex vertex2 = graph.getJobVertex(jobVertexID2).getTaskVertices()[0];
    ExecutionAttemptID attemptID1 = vertex1.getCurrentExecutionAttempt().getAttemptId();
    // set up the coordinator
    CheckpointCoordinator checkpointCoordinator = new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2)).setTimer(manuallyTriggeredScheduledExecutor).build();
    // trigger a checkpoint, partially acknowledged
    final CompletableFuture<CompletedCheckpoint> checkpointFuture = checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    FutureUtils.throwIfCompletedExceptionally(checkpointFuture);
    assertEquals(1, checkpointCoordinator.getNumberOfPendingCheckpoints());
    PendingCheckpoint checkpoint = checkpointCoordinator.getPendingCheckpoints().values().iterator().next();
    assertFalse(checkpoint.isDisposed());
    OperatorID opID1 = vertex1.getJobVertex().getOperatorIDs().get(0).getGeneratedOperatorID();
    TaskStateSnapshot taskOperatorSubtaskStates1 = spy(new TaskStateSnapshot());
    OperatorSubtaskState subtaskState1 = mock(OperatorSubtaskState.class);
    taskOperatorSubtaskStates1.putSubtaskStateByOperatorID(opID1, subtaskState1);
    checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptID1, checkpoint.getCheckpointId(), new CheckpointMetrics(), taskOperatorSubtaskStates1), TASK_MANAGER_LOCATION_INFO);
    // triggers cancelling
    manuallyTriggeredScheduledExecutor.triggerScheduledTasks();
    assertTrue("Checkpoint was not canceled by the timeout", checkpoint.isDisposed());
    assertEquals(0, checkpointCoordinator.getNumberOfPendingCheckpoints());
    assertEquals(0, checkpointCoordinator.getNumberOfRetainedSuccessfulCheckpoints());
    // validate that the received states have been discarded
    verify(subtaskState1, times(1)).discardState();
    // no confirm message must have been sent
    for (ExecutionVertex vertex : Arrays.asList(vertex1, vertex2)) {
        ExecutionAttemptID attemptId = vertex.getCurrentExecutionAttempt().getAttemptId();
        assertEquals(0, gateway.getNotifiedCompletedCheckpoints(attemptId).size());
    }
    checkpointCoordinator.shutdown();
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Test(org.junit.Test)

Example 10 with OperatorID

use of org.apache.flink.runtime.jobgraph.OperatorID in project flink by apache.

the class CheckpointCoordinatorTest method ackCheckpoint.

private void ackCheckpoint(long checkpointId, CheckpointCoordinator coordinator, JobVertexID ackVertexID, ExecutionGraph graph, TestingStreamStateHandle metaState, TestingStreamStateHandle privateState, TestingStreamStateHandle sharedState) throws CheckpointException {
    Map<StateHandleID, StreamStateHandle> sharedStateMap = new HashMap<>(singletonMap(new StateHandleID("shared-state-key"), sharedState));
    Map<StateHandleID, StreamStateHandle> privateStateMap = new HashMap<>(singletonMap(new StateHandleID("private-state-key"), privateState));
    ExecutionJobVertex jobVertex = graph.getJobVertex(ackVertexID);
    OperatorID operatorID = jobVertex.getOperatorIDs().get(0).getGeneratedOperatorID();
    coordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), jobVertex.getTaskVertices()[0].getCurrentExecutionAttempt().getAttemptId(), checkpointId, new CheckpointMetrics(), new TaskStateSnapshot(singletonMap(operatorID, OperatorSubtaskState.builder().setManagedKeyedState(new IncrementalRemoteKeyedStateHandle(UUID.randomUUID(), KeyGroupRange.of(0, 9), checkpointId, sharedStateMap, privateStateMap, metaState)).build()))), "test");
}
Also used : AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) PlaceholderStreamStateHandle(org.apache.flink.runtime.state.PlaceholderStreamStateHandle) StreamStateHandle(org.apache.flink.runtime.state.StreamStateHandle) ByteStreamStateHandle(org.apache.flink.runtime.state.memory.ByteStreamStateHandle) OperatorStreamStateHandle(org.apache.flink.runtime.state.OperatorStreamStateHandle) TestingStreamStateHandle(org.apache.flink.runtime.state.TestingStreamStateHandle) HashMap(java.util.HashMap) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) StateHandleID(org.apache.flink.runtime.state.StateHandleID) IncrementalRemoteKeyedStateHandle(org.apache.flink.runtime.state.IncrementalRemoteKeyedStateHandle) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID)

Aggregations

OperatorID (org.apache.flink.runtime.jobgraph.OperatorID)211 Test (org.junit.Test)132 HashMap (java.util.HashMap)46 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)44 StreamConfig (org.apache.flink.streaming.api.graph.StreamConfig)41 JobID (org.apache.flink.api.common.JobID)38 Configuration (org.apache.flink.configuration.Configuration)30 ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)28 ExecutionJobVertex (org.apache.flink.runtime.executiongraph.ExecutionJobVertex)28 ExecutionConfig (org.apache.flink.api.common.ExecutionConfig)24 ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph)23 OperatorSubtaskState (org.apache.flink.runtime.checkpoint.OperatorSubtaskState)21 OperatorStateHandle (org.apache.flink.runtime.state.OperatorStateHandle)21 ArrayList (java.util.ArrayList)20 HashSet (java.util.HashSet)20 TaskStateSnapshot (org.apache.flink.runtime.checkpoint.TaskStateSnapshot)19 OperatorStreamStateHandle (org.apache.flink.runtime.state.OperatorStreamStateHandle)19 MemoryStateBackend (org.apache.flink.runtime.state.memory.MemoryStateBackend)19 IOException (java.io.IOException)18 ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex)18