Search in sources :

Example 31 with AcknowledgeCheckpoint

use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.

the class CheckpointCoordinatorTest method testNotifyCheckpointAbortionInOperatorCoordinator.

@Test
public void testNotifyCheckpointAbortionInOperatorCoordinator() throws Exception {
    JobVertexID jobVertexID = new JobVertexID();
    ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID).build();
    ExecutionVertex executionVertex = graph.getJobVertex(jobVertexID).getTaskVertices()[0];
    ExecutionAttemptID attemptID = executionVertex.getCurrentExecutionAttempt().getAttemptId();
    CheckpointCoordinatorTestingUtils.MockOperatorCoordinatorCheckpointContext context = new CheckpointCoordinatorTestingUtils.MockOperatorCheckpointCoordinatorContextBuilder().setOperatorID(new OperatorID()).setOnCallingCheckpointCoordinator((ignored, future) -> future.complete(new byte[0])).build();
    // set up the coordinator and validate the initial state
    CheckpointCoordinator checkpointCoordinator = new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCheckpointCoordinatorConfiguration(CheckpointCoordinatorConfiguration.builder().setMaxConcurrentCheckpoints(Integer.MAX_VALUE).build()).setTimer(manuallyTriggeredScheduledExecutor).setCoordinatorsToCheckpoint(Collections.singleton(context)).build();
    try {
        // Trigger checkpoint 1.
        checkpointCoordinator.triggerCheckpoint(false);
        manuallyTriggeredScheduledExecutor.triggerAll();
        long checkpointId1 = Collections.max(checkpointCoordinator.getPendingCheckpoints().keySet());
        // Trigger checkpoint 2.
        checkpointCoordinator.triggerCheckpoint(false);
        manuallyTriggeredScheduledExecutor.triggerAll();
        // Acknowledge checkpoint 2. This should abort checkpoint 1.
        long checkpointId2 = Collections.max(checkpointCoordinator.getPendingCheckpoints().keySet());
        AcknowledgeCheckpoint acknowledgeCheckpoint1 = new AcknowledgeCheckpoint(graph.getJobID(), attemptID, checkpointId2, new CheckpointMetrics(), null);
        checkpointCoordinator.receiveAcknowledgeMessage(acknowledgeCheckpoint1, "");
        // OperatorCoordinator should have been notified of the abortion of checkpoint 1.
        assertEquals(Collections.singletonList(1L), context.getAbortedCheckpoints());
        assertEquals(Collections.singletonList(2L), context.getCompletedCheckpoints());
    } finally {
        checkpointCoordinator.shutdown();
    }
}
Also used : ManuallyTriggeredScheduledExecutor(org.apache.flink.util.concurrent.ManuallyTriggeredScheduledExecutor) Arrays(java.util.Arrays) UnregisteredMetricsGroup(org.apache.flink.metrics.groups.UnregisteredMetricsGroup) Tuple2(org.apache.flink.api.java.tuple.Tuple2) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) Future(java.util.concurrent.Future) Path(org.apache.flink.core.fs.Path) Map(java.util.Map) CHECKPOINT_DECLINED(org.apache.flink.runtime.checkpoint.CheckpointFailureReason.CHECKPOINT_DECLINED) ScheduledExecutor(org.apache.flink.util.concurrent.ScheduledExecutor) Set(java.util.Set) FsStateBackend(org.apache.flink.runtime.state.filesystem.FsStateBackend) ArgumentMatchers.anyList(org.mockito.ArgumentMatchers.anyList) Executors(java.util.concurrent.Executors) VerificationMode(org.mockito.verification.VerificationMode) Matchers.any(org.mockito.Matchers.any) PlaceholderStreamStateHandle(org.apache.flink.runtime.state.PlaceholderStreamStateHandle) SimpleVersionedSerializer(org.apache.flink.core.io.SimpleVersionedSerializer) Assert.assertFalse(org.junit.Assert.assertFalse) MemoryBackendCheckpointStorageAccess(org.apache.flink.runtime.state.memory.MemoryBackendCheckpointStorageAccess) Mockito.eq(org.mockito.Mockito.eq) Mockito.mock(org.mockito.Mockito.mock) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) StateHandleID(org.apache.flink.runtime.state.StateHandleID) JobStatus(org.apache.flink.api.common.JobStatus) Mockito.spy(org.mockito.Mockito.spy) ArrayList(java.util.ArrayList) DirectScheduledExecutorService(org.apache.flink.runtime.testutils.DirectScheduledExecutorService) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) Iterables(org.apache.flink.shaded.guava30.com.google.common.collect.Iterables) Nullable(javax.annotation.Nullable) Before(org.junit.Before) NonPersistentMetadataCheckpointStorageLocation(org.apache.flink.runtime.state.memory.NonPersistentMetadataCheckpointStorageLocation) CheckpointStorageLocation(org.apache.flink.runtime.state.CheckpointStorageLocation) Executor(java.util.concurrent.Executor) TriFunctionWithException(org.apache.flink.util.function.TriFunctionWithException) ExecutionState(org.apache.flink.runtime.execution.ExecutionState) Assert.assertTrue(org.junit.Assert.assertTrue) Test(org.junit.Test) IOException(java.io.IOException) Mockito.times(org.mockito.Mockito.times) StreamStateHandle(org.apache.flink.runtime.state.StreamStateHandle) SimpleAckingTaskManagerGateway(org.apache.flink.runtime.executiongraph.utils.SimpleAckingTaskManagerGateway) File(java.io.File) Assert.assertNotEquals(org.junit.Assert.assertNotEquals) ExecutionException(java.util.concurrent.ExecutionException) AtomicLong(java.util.concurrent.atomic.AtomicLong) Mockito.never(org.mockito.Mockito.never) JobID(org.apache.flink.api.common.JobID) ByteStreamStateHandle(org.apache.flink.runtime.state.memory.ByteStreamStateHandle) Assert(org.junit.Assert) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) JobManagerCheckpointStorage(org.apache.flink.runtime.state.storage.JobManagerCheckpointStorage) Mockito.reset(org.mockito.Mockito.reset) Assert.assertEquals(org.junit.Assert.assertEquals) ComponentMainThreadExecutorServiceAdapter(org.apache.flink.runtime.concurrent.ComponentMainThreadExecutorServiceAdapter) ScheduledFuture(java.util.concurrent.ScheduledFuture) Assertions.assertThat(org.assertj.core.api.Assertions.assertThat) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) SharedStateRegistry(org.apache.flink.runtime.state.SharedStateRegistry) ExceptionUtils(org.apache.flink.util.ExceptionUtils) Random(java.util.Random) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) CheckpointStorage(org.apache.flink.runtime.state.CheckpointStorage) IncrementalRemoteKeyedStateHandle(org.apache.flink.runtime.state.IncrementalRemoteKeyedStateHandle) TestingLogicalSlotBuilder(org.apache.flink.runtime.jobmaster.TestingLogicalSlotBuilder) RpcException(org.apache.flink.runtime.rpc.exceptions.RpcException) TestLogger(org.apache.flink.util.TestLogger) Mockito.atLeast(org.mockito.Mockito.atLeast) CheckpointStorageAccess(org.apache.flink.runtime.state.CheckpointStorageAccess) Assert.fail(org.junit.Assert.fail) Preconditions.checkNotNull(org.apache.flink.util.Preconditions.checkNotNull) KeyGroupRange(org.apache.flink.runtime.state.KeyGroupRange) KeyedStateHandle(org.apache.flink.runtime.state.KeyedStateHandle) Collection(java.util.Collection) CHECKPOINT_ASYNC_EXCEPTION(org.apache.flink.runtime.checkpoint.CheckpointFailureReason.CHECKPOINT_ASYNC_EXCEPTION) UUID(java.util.UUID) MemoryStateBackend(org.apache.flink.runtime.state.memory.MemoryStateBackend) Acknowledge(org.apache.flink.runtime.messages.Acknowledge) IO_EXCEPTION(org.apache.flink.runtime.checkpoint.CheckpointFailureReason.IO_EXCEPTION) List(java.util.List) CheckpointMetadataOutputStream(org.apache.flink.runtime.state.CheckpointMetadataOutputStream) FileSystem(org.apache.flink.core.fs.FileSystem) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) Optional(java.util.Optional) KeyGroupRangeAssignment(org.apache.flink.runtime.state.KeyGroupRangeAssignment) CHECKPOINT_EXPIRED(org.apache.flink.runtime.checkpoint.CheckpointFailureReason.CHECKPOINT_EXPIRED) ArgumentMatchers.anyLong(org.mockito.ArgumentMatchers.anyLong) SavepointFormatType(org.apache.flink.core.execution.SavepointFormatType) FileStateHandle(org.apache.flink.runtime.state.filesystem.FileStateHandle) TestCompletedCheckpointStorageLocation(org.apache.flink.runtime.state.testutils.TestCompletedCheckpointStorageLocation) ScheduledExecutorServiceAdapter(org.apache.flink.util.concurrent.ScheduledExecutorServiceAdapter) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) HashMap(java.util.HashMap) CompletableFuture(java.util.concurrent.CompletableFuture) OperatorStreamStateHandle(org.apache.flink.runtime.state.OperatorStreamStateHandle) AtomicReference(java.util.concurrent.atomic.AtomicReference) Function(java.util.function.Function) Execution(org.apache.flink.runtime.executiongraph.Execution) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) HashSet(java.util.HashSet) PERIODIC_SCHEDULER_SHUTDOWN(org.apache.flink.runtime.checkpoint.CheckpointFailureReason.PERIODIC_SCHEDULER_SHUTDOWN) OperatorStateHandle(org.apache.flink.runtime.state.OperatorStateHandle) Collections.singletonMap(java.util.Collections.singletonMap) Preconditions.checkState(org.apache.flink.util.Preconditions.checkState) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) Iterator(java.util.Iterator) Assert.assertNotNull(org.junit.Assert.assertNotNull) ExecutionGraphTestUtils(org.apache.flink.runtime.executiongraph.ExecutionGraphTestUtils) LogicalSlot(org.apache.flink.runtime.jobmaster.LogicalSlot) Mockito.when(org.mockito.Mockito.when) INVALID_CHECKPOINT_ID(org.apache.flink.runtime.checkpoint.CheckpointStoreUtil.INVALID_CHECKPOINT_ID) DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) TestingStreamStateHandle(org.apache.flink.runtime.state.TestingStreamStateHandle) Mockito.verify(org.mockito.Mockito.verify) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) Rule(org.junit.Rule) Collections(java.util.Collections) TemporaryFolder(org.junit.rules.TemporaryFolder) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Test(org.junit.Test)

Example 32 with AcknowledgeCheckpoint

use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.

the class CheckpointCoordinatorTest method performIncrementalCheckpoint.

private void performIncrementalCheckpoint(JobID jobId, CheckpointCoordinator checkpointCoordinator, ExecutionJobVertex jobVertex1, List<KeyGroupRange> keyGroupPartitions1, int cpSequenceNumber) throws Exception {
    // trigger the checkpoint
    checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    assertEquals(1, checkpointCoordinator.getPendingCheckpoints().size());
    long checkpointId = Iterables.getOnlyElement(checkpointCoordinator.getPendingCheckpoints().keySet());
    for (int index = 0; index < jobVertex1.getParallelism(); index++) {
        KeyGroupRange keyGroupRange = keyGroupPartitions1.get(index);
        Map<StateHandleID, StreamStateHandle> privateState = new HashMap<>();
        privateState.put(new StateHandleID("private-1"), spy(new ByteStreamStateHandle("private-1", new byte[] { 'p' })));
        Map<StateHandleID, StreamStateHandle> sharedState = new HashMap<>();
        // let all but the first CP overlap by one shared state.
        if (cpSequenceNumber > 0) {
            sharedState.put(new StateHandleID("shared-" + (cpSequenceNumber - 1)), spy(new PlaceholderStreamStateHandle(1L)));
        }
        sharedState.put(new StateHandleID("shared-" + cpSequenceNumber), spy(new ByteStreamStateHandle("shared-" + cpSequenceNumber + "-" + keyGroupRange, new byte[] { 's' })));
        IncrementalRemoteKeyedStateHandle managedState = spy(new IncrementalRemoteKeyedStateHandle(new UUID(42L, 42L), keyGroupRange, checkpointId, sharedState, privateState, spy(new ByteStreamStateHandle("meta", new byte[] { 'm' }))));
        OperatorSubtaskState operatorSubtaskState = spy(OperatorSubtaskState.builder().setManagedKeyedState(managedState).build());
        Map<OperatorID, OperatorSubtaskState> opStates = new HashMap<>();
        opStates.put(jobVertex1.getOperatorIDs().get(0).getGeneratedOperatorID(), operatorSubtaskState);
        TaskStateSnapshot taskStateSnapshot = new TaskStateSnapshot(opStates);
        AcknowledgeCheckpoint acknowledgeCheckpoint = new AcknowledgeCheckpoint(jobId, jobVertex1.getTaskVertices()[index].getCurrentExecutionAttempt().getAttemptId(), checkpointId, new CheckpointMetrics(), taskStateSnapshot);
        checkpointCoordinator.receiveAcknowledgeMessage(acknowledgeCheckpoint, TASK_MANAGER_LOCATION_INFO);
    }
}
Also used : PlaceholderStreamStateHandle(org.apache.flink.runtime.state.PlaceholderStreamStateHandle) HashMap(java.util.HashMap) KeyGroupRange(org.apache.flink.runtime.state.KeyGroupRange) ByteStreamStateHandle(org.apache.flink.runtime.state.memory.ByteStreamStateHandle) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) PlaceholderStreamStateHandle(org.apache.flink.runtime.state.PlaceholderStreamStateHandle) StreamStateHandle(org.apache.flink.runtime.state.StreamStateHandle) ByteStreamStateHandle(org.apache.flink.runtime.state.memory.ByteStreamStateHandle) OperatorStreamStateHandle(org.apache.flink.runtime.state.OperatorStreamStateHandle) TestingStreamStateHandle(org.apache.flink.runtime.state.TestingStreamStateHandle) StateHandleID(org.apache.flink.runtime.state.StateHandleID) IncrementalRemoteKeyedStateHandle(org.apache.flink.runtime.state.IncrementalRemoteKeyedStateHandle) UUID(java.util.UUID)

Example 33 with AcknowledgeCheckpoint

use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.

the class CheckpointCoordinatorFailureTest method testStoringFailureHandling.

private void testStoringFailureHandling(Exception failure, int expectedCleanupCalls) throws Exception {
    final JobVertexID jobVertexID1 = new JobVertexID();
    final ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1).build();
    final ExecutionVertex vertex = graph.getJobVertex(jobVertexID1).getTaskVertices()[0];
    final ExecutionAttemptID attemptId = vertex.getCurrentExecutionAttempt().getAttemptId();
    final StandaloneCheckpointIDCounter checkpointIDCounter = new StandaloneCheckpointIDCounter();
    final ManuallyTriggeredScheduledExecutor manuallyTriggeredScheduledExecutor = new ManuallyTriggeredScheduledExecutor();
    final CompletedCheckpointStore completedCheckpointStore = new FailingCompletedCheckpointStore(failure);
    final AtomicInteger cleanupCallCount = new AtomicInteger(0);
    final CheckpointCoordinator checkpointCoordinator = new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCheckpointIDCounter(checkpointIDCounter).setCheckpointsCleaner(new CheckpointsCleaner() {

        private static final long serialVersionUID = 2029876992397573325L;

        @Override
        public void cleanCheckpointOnFailedStoring(CompletedCheckpoint completedCheckpoint, Executor executor) {
            cleanupCallCount.incrementAndGet();
            super.cleanCheckpointOnFailedStoring(completedCheckpoint, executor);
        }
    }).setCompletedCheckpointStore(completedCheckpointStore).setTimer(manuallyTriggeredScheduledExecutor).build();
    checkpointCoordinator.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    try {
        checkpointCoordinator.receiveAcknowledgeMessage(new AcknowledgeCheckpoint(graph.getJobID(), attemptId, checkpointIDCounter.getLast()), "unknown location");
        fail("CheckpointException should have been thrown.");
    } catch (CheckpointException e) {
        assertThat(e.getCheckpointFailureReason(), is(CheckpointFailureReason.FINALIZE_CHECKPOINT_FAILURE));
    }
    assertThat(cleanupCallCount.get(), is(expectedCleanupCalls));
}
Also used : ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) ManuallyTriggeredScheduledExecutor(org.apache.flink.util.concurrent.ManuallyTriggeredScheduledExecutor) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ManuallyTriggeredScheduledExecutor(org.apache.flink.util.concurrent.ManuallyTriggeredScheduledExecutor) Executor(java.util.concurrent.Executor) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph)

Example 34 with AcknowledgeCheckpoint

use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.

the class CheckpointCoordinatorFailureTest method testFailingCompletedCheckpointStoreAdd.

/**
 * Tests that a failure while storing a completed checkpoint in the completed checkpoint store
 * will properly fail the originating pending checkpoint and clean upt the completed checkpoint.
 */
@Test
public void testFailingCompletedCheckpointStoreAdd() throws Exception {
    JobVertexID jobVertexId = new JobVertexID();
    final ManuallyTriggeredScheduledExecutor manuallyTriggeredScheduledExecutor = new ManuallyTriggeredScheduledExecutor();
    ExecutionGraph testGraph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexId).build();
    ExecutionVertex vertex = testGraph.getJobVertex(jobVertexId).getTaskVertices()[0];
    // set up the coordinator and validate the initial state
    CheckpointCoordinator coord = new CheckpointCoordinatorBuilder().setExecutionGraph(testGraph).setCompletedCheckpointStore(new FailingCompletedCheckpointStore(new Exception("The failing completed checkpoint store failed again... :-("))).setTimer(manuallyTriggeredScheduledExecutor).build();
    coord.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    assertEquals(1, coord.getNumberOfPendingCheckpoints());
    PendingCheckpoint pendingCheckpoint = coord.getPendingCheckpoints().values().iterator().next();
    assertFalse(pendingCheckpoint.isDisposed());
    final long checkpointId = coord.getPendingCheckpoints().keySet().iterator().next();
    KeyedStateHandle managedKeyedHandle = mock(KeyedStateHandle.class);
    KeyedStateHandle rawKeyedHandle = mock(KeyedStateHandle.class);
    OperatorStateHandle managedOpHandle = mock(OperatorStreamStateHandle.class);
    OperatorStateHandle rawOpHandle = mock(OperatorStreamStateHandle.class);
    InputChannelStateHandle inputChannelStateHandle = new InputChannelStateHandle(new InputChannelInfo(0, 1), mock(StreamStateHandle.class), Collections.singletonList(1L));
    ResultSubpartitionStateHandle resultSubpartitionStateHandle = new ResultSubpartitionStateHandle(new ResultSubpartitionInfo(0, 1), mock(StreamStateHandle.class), Collections.singletonList(1L));
    final OperatorSubtaskState operatorSubtaskState = spy(OperatorSubtaskState.builder().setManagedOperatorState(managedOpHandle).setRawOperatorState(rawOpHandle).setManagedKeyedState(managedKeyedHandle).setRawKeyedState(rawKeyedHandle).setInputChannelState(StateObjectCollection.singleton(inputChannelStateHandle)).setResultSubpartitionState(StateObjectCollection.singleton(resultSubpartitionStateHandle)).build());
    TaskStateSnapshot subtaskState = spy(new TaskStateSnapshot());
    subtaskState.putSubtaskStateByOperatorID(new OperatorID(), operatorSubtaskState);
    when(subtaskState.getSubtaskStateByOperatorID(OperatorID.fromJobVertexID(vertex.getJobvertexId()))).thenReturn(operatorSubtaskState);
    AcknowledgeCheckpoint acknowledgeMessage = new AcknowledgeCheckpoint(testGraph.getJobID(), vertex.getCurrentExecutionAttempt().getAttemptId(), checkpointId, new CheckpointMetrics(), subtaskState);
    try {
        coord.receiveAcknowledgeMessage(acknowledgeMessage, "Unknown location");
        fail("Expected a checkpoint exception because the completed checkpoint store could not " + "store the completed checkpoint.");
    } catch (CheckpointException e) {
    // ignore because we expected this exception
    }
    // make sure that the pending checkpoint has been discarded after we could not complete it
    assertTrue(pendingCheckpoint.isDisposed());
    // make sure that the subtask state has been discarded after we could not complete it.
    verify(operatorSubtaskState).discardState();
    verify(operatorSubtaskState.getManagedOperatorState().iterator().next()).discardState();
    verify(operatorSubtaskState.getRawOperatorState().iterator().next()).discardState();
    verify(operatorSubtaskState.getManagedKeyedState().iterator().next()).discardState();
    verify(operatorSubtaskState.getRawKeyedState().iterator().next()).discardState();
    verify(operatorSubtaskState.getInputChannelState().iterator().next().getDelegate()).discardState();
    verify(operatorSubtaskState.getResultSubpartitionState().iterator().next().getDelegate()).discardState();
}
Also used : InputChannelInfo(org.apache.flink.runtime.checkpoint.channel.InputChannelInfo) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) KeyedStateHandle(org.apache.flink.runtime.state.KeyedStateHandle) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) FlinkRuntimeException(org.apache.flink.util.FlinkRuntimeException) PossibleInconsistentStateException(org.apache.flink.runtime.persistence.PossibleInconsistentStateException) ManuallyTriggeredScheduledExecutor(org.apache.flink.util.concurrent.ManuallyTriggeredScheduledExecutor) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) OperatorStreamStateHandle(org.apache.flink.runtime.state.OperatorStreamStateHandle) StreamStateHandle(org.apache.flink.runtime.state.StreamStateHandle) ResultSubpartitionInfo(org.apache.flink.runtime.checkpoint.channel.ResultSubpartitionInfo) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) ResultSubpartitionStateHandle(org.apache.flink.runtime.state.ResultSubpartitionStateHandle) OperatorStateHandle(org.apache.flink.runtime.state.OperatorStateHandle) InputChannelStateHandle(org.apache.flink.runtime.state.InputChannelStateHandle) Test(org.junit.Test)

Example 35 with AcknowledgeCheckpoint

use of org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint in project flink by apache.

the class CheckpointCoordinatorRestoringTest method testRestoreLatestCheckpointedStateWithChangingParallelism.

/**
 * Tests the checkpoint restoration with changing parallelism of job vertex with partitioned
 * state.
 */
private void testRestoreLatestCheckpointedStateWithChangingParallelism(boolean scaleOut) throws Exception {
    final JobVertexID jobVertexID1 = new JobVertexID();
    final JobVertexID jobVertexID2 = new JobVertexID();
    int parallelism1 = 3;
    int parallelism2 = scaleOut ? 2 : 13;
    int maxParallelism1 = 42;
    int maxParallelism2 = 13;
    int newParallelism2 = scaleOut ? 13 : 2;
    CompletedCheckpointStore completedCheckpointStore = new EmbeddedCompletedCheckpointStore();
    final ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1, parallelism1, maxParallelism1).addJobVertex(jobVertexID2, parallelism2, maxParallelism2).build();
    final ExecutionJobVertex jobVertex1 = graph.getJobVertex(jobVertexID1);
    final ExecutionJobVertex jobVertex2 = graph.getJobVertex(jobVertexID2);
    // set up the coordinator and validate the initial state
    CheckpointCoordinator coord = new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCompletedCheckpointStore(completedCheckpointStore).setTimer(manuallyTriggeredScheduledExecutor).build();
    // trigger the checkpoint
    coord.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    assertEquals(1, coord.getPendingCheckpoints().size());
    long checkpointId = Iterables.getOnlyElement(coord.getPendingCheckpoints().keySet());
    List<KeyGroupRange> keyGroupPartitions1 = StateAssignmentOperation.createKeyGroupPartitions(maxParallelism1, parallelism1);
    List<KeyGroupRange> keyGroupPartitions2 = StateAssignmentOperation.createKeyGroupPartitions(maxParallelism2, parallelism2);
    // vertex 1
    for (int index = 0; index < jobVertex1.getParallelism(); index++) {
        OperatorStateHandle opStateBackend = generatePartitionableStateHandle(jobVertexID1, index, 2, 8, false);
        KeyGroupsStateHandle keyedStateBackend = generateKeyGroupState(jobVertexID1, keyGroupPartitions1.get(index), false);
        KeyGroupsStateHandle keyedStateRaw = generateKeyGroupState(jobVertexID1, keyGroupPartitions1.get(index), true);
        OperatorSubtaskState operatorSubtaskState = OperatorSubtaskState.builder().setManagedOperatorState(opStateBackend).setManagedKeyedState(keyedStateBackend).setRawKeyedState(keyedStateRaw).setInputChannelState(StateObjectCollection.singleton(createNewInputChannelStateHandle(3, new Random()))).build();
        TaskStateSnapshot taskOperatorSubtaskStates = new TaskStateSnapshot();
        taskOperatorSubtaskStates.putSubtaskStateByOperatorID(OperatorID.fromJobVertexID(jobVertexID1), operatorSubtaskState);
        AcknowledgeCheckpoint acknowledgeCheckpoint = new AcknowledgeCheckpoint(graph.getJobID(), jobVertex1.getTaskVertices()[index].getCurrentExecutionAttempt().getAttemptId(), checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStates);
        coord.receiveAcknowledgeMessage(acknowledgeCheckpoint, TASK_MANAGER_LOCATION_INFO);
    }
    // vertex 2
    final List<ChainedStateHandle<OperatorStateHandle>> expectedOpStatesBackend = new ArrayList<>(jobVertex2.getParallelism());
    final List<ChainedStateHandle<OperatorStateHandle>> expectedOpStatesRaw = new ArrayList<>(jobVertex2.getParallelism());
    for (int index = 0; index < jobVertex2.getParallelism(); index++) {
        KeyGroupsStateHandle keyedStateBackend = generateKeyGroupState(jobVertexID2, keyGroupPartitions2.get(index), false);
        KeyGroupsStateHandle keyedStateRaw = generateKeyGroupState(jobVertexID2, keyGroupPartitions2.get(index), true);
        OperatorStateHandle opStateBackend = generatePartitionableStateHandle(jobVertexID2, index, 2, 8, false);
        OperatorStateHandle opStateRaw = generatePartitionableStateHandle(jobVertexID2, index, 2, 8, true);
        expectedOpStatesBackend.add(new ChainedStateHandle<>(singletonList(opStateBackend)));
        expectedOpStatesRaw.add(new ChainedStateHandle<>(singletonList(opStateRaw)));
        OperatorSubtaskState operatorSubtaskState = OperatorSubtaskState.builder().setManagedOperatorState(opStateBackend).setRawOperatorState(opStateRaw).setManagedKeyedState(keyedStateBackend).setRawKeyedState(keyedStateRaw).build();
        TaskStateSnapshot taskOperatorSubtaskStates = new TaskStateSnapshot();
        taskOperatorSubtaskStates.putSubtaskStateByOperatorID(OperatorID.fromJobVertexID(jobVertexID2), operatorSubtaskState);
        AcknowledgeCheckpoint acknowledgeCheckpoint = new AcknowledgeCheckpoint(graph.getJobID(), jobVertex2.getTaskVertices()[index].getCurrentExecutionAttempt().getAttemptId(), checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStates);
        coord.receiveAcknowledgeMessage(acknowledgeCheckpoint, TASK_MANAGER_LOCATION_INFO);
    }
    List<CompletedCheckpoint> completedCheckpoints = coord.getSuccessfulCheckpoints();
    assertEquals(1, completedCheckpoints.size());
    List<KeyGroupRange> newKeyGroupPartitions2 = StateAssignmentOperation.createKeyGroupPartitions(maxParallelism2, newParallelism2);
    // rescale vertex 2
    final ExecutionGraph newGraph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1, parallelism1, maxParallelism1).addJobVertex(jobVertexID2, newParallelism2, maxParallelism2).build();
    final ExecutionJobVertex newJobVertex1 = newGraph.getJobVertex(jobVertexID1);
    final ExecutionJobVertex newJobVertex2 = newGraph.getJobVertex(jobVertexID2);
    // set up the coordinator and validate the initial state
    CheckpointCoordinator newCoord = new CheckpointCoordinatorBuilder().setExecutionGraph(newGraph).setCompletedCheckpointStore(completedCheckpointStore).setTimer(manuallyTriggeredScheduledExecutor).build();
    Set<ExecutionJobVertex> tasks = new HashSet<>();
    tasks.add(newJobVertex1);
    tasks.add(newJobVertex2);
    assertTrue(newCoord.restoreLatestCheckpointedStateToAll(tasks, false));
    // verify the restored state
    verifyStateRestore(jobVertexID1, newJobVertex1, keyGroupPartitions1);
    List<List<Collection<OperatorStateHandle>>> actualOpStatesBackend = new ArrayList<>(newJobVertex2.getParallelism());
    List<List<Collection<OperatorStateHandle>>> actualOpStatesRaw = new ArrayList<>(newJobVertex2.getParallelism());
    for (int i = 0; i < newJobVertex2.getParallelism(); i++) {
        List<OperatorIDPair> operatorIDs = newJobVertex2.getOperatorIDs();
        KeyGroupsStateHandle originalKeyedStateBackend = generateKeyGroupState(jobVertexID2, newKeyGroupPartitions2.get(i), false);
        KeyGroupsStateHandle originalKeyedStateRaw = generateKeyGroupState(jobVertexID2, newKeyGroupPartitions2.get(i), true);
        JobManagerTaskRestore taskRestore = newJobVertex2.getTaskVertices()[i].getCurrentExecutionAttempt().getTaskRestore();
        Assert.assertEquals(1L, taskRestore.getRestoreCheckpointId());
        TaskStateSnapshot taskStateHandles = taskRestore.getTaskStateSnapshot();
        final int headOpIndex = operatorIDs.size() - 1;
        List<Collection<OperatorStateHandle>> allParallelManagedOpStates = new ArrayList<>(operatorIDs.size());
        List<Collection<OperatorStateHandle>> allParallelRawOpStates = new ArrayList<>(operatorIDs.size());
        for (int idx = 0; idx < operatorIDs.size(); ++idx) {
            OperatorID operatorID = operatorIDs.get(idx).getGeneratedOperatorID();
            OperatorSubtaskState opState = taskStateHandles.getSubtaskStateByOperatorID(operatorID);
            Collection<OperatorStateHandle> opStateBackend = opState.getManagedOperatorState();
            Collection<OperatorStateHandle> opStateRaw = opState.getRawOperatorState();
            allParallelManagedOpStates.add(opStateBackend);
            allParallelRawOpStates.add(opStateRaw);
            if (idx == headOpIndex) {
                Collection<KeyedStateHandle> keyedStateBackend = opState.getManagedKeyedState();
                Collection<KeyedStateHandle> keyGroupStateRaw = opState.getRawKeyedState();
                compareKeyedState(singletonList(originalKeyedStateBackend), keyedStateBackend);
                compareKeyedState(singletonList(originalKeyedStateRaw), keyGroupStateRaw);
            }
        }
        actualOpStatesBackend.add(allParallelManagedOpStates);
        actualOpStatesRaw.add(allParallelRawOpStates);
    }
    comparePartitionableState(expectedOpStatesBackend, actualOpStatesBackend);
    comparePartitionableState(expectedOpStatesRaw, actualOpStatesRaw);
}
Also used : JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) KeyGroupRange(org.apache.flink.runtime.state.KeyGroupRange) ArrayList(java.util.ArrayList) OperatorID(org.apache.flink.runtime.jobgraph.OperatorID) KeyedStateHandle(org.apache.flink.runtime.state.KeyedStateHandle) KeyGroupsStateHandle(org.apache.flink.runtime.state.KeyGroupsStateHandle) ChainedStateHandle(org.apache.flink.runtime.state.ChainedStateHandle) Random(java.util.Random) ExecutionJobVertex(org.apache.flink.runtime.executiongraph.ExecutionJobVertex) Collections.singletonList(java.util.Collections.singletonList) Collections.emptyList(java.util.Collections.emptyList) List(java.util.List) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) Collection(java.util.Collection) OperatorStateHandle(org.apache.flink.runtime.state.OperatorStateHandle) OperatorIDPair(org.apache.flink.runtime.OperatorIDPair)

Aggregations

AcknowledgeCheckpoint (org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint)45 JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID)35 Test (org.junit.Test)33 ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex)32 ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID)29 ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph)29 CheckpointCoordinatorBuilder (org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder)23 DeclineCheckpoint (org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint)18 JobID (org.apache.flink.api.common.JobID)15 OperatorID (org.apache.flink.runtime.jobgraph.OperatorID)14 KeyGroupRange (org.apache.flink.runtime.state.KeyGroupRange)13 ExecutionJobVertex (org.apache.flink.runtime.executiongraph.ExecutionJobVertex)12 HashMap (java.util.HashMap)9 IOException (java.io.IOException)8 ArrayList (java.util.ArrayList)8 StreamStateHandle (org.apache.flink.runtime.state.StreamStateHandle)8 KeyGroupsStateHandle (org.apache.flink.runtime.state.KeyGroupsStateHandle)7 ByteStreamStateHandle (org.apache.flink.runtime.state.memory.ByteStreamStateHandle)7 ManuallyTriggeredScheduledExecutor (org.apache.flink.util.concurrent.ManuallyTriggeredScheduledExecutor)7 HashSet (java.util.HashSet)6