use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.
the class CheckpointCoordinatorFailureTest method testStoringFailureHandling.
/**
 * Triggers one checkpoint on a coordinator whose completed checkpoint store is a {@code
 * FailingCompletedCheckpointStore} built from the given exception, acknowledges it, and then
 * checks both the reported failure reason and the number of cleanup-hook invocations.
 *
 * @param failure exception handed to the {@code FailingCompletedCheckpointStore}
 * @param expectedCleanupCalls expected number of calls to {@code cleanCheckpointOnFailedStoring}
 * @throws Exception if building the fixtures or triggering the checkpoint fails unexpectedly
 */
private void testStoringFailureHandling(Exception failure, int expectedCleanupCalls) throws Exception {
    final JobVertexID vertexId = new JobVertexID();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId)
                    .build();
    final ExecutionVertex firstSubtask = executionGraph.getJobVertex(vertexId).getTaskVertices()[0];
    final ExecutionAttemptID attempt = firstSubtask.getCurrentExecutionAttempt().getAttemptId();

    final StandaloneCheckpointIDCounter idCounter = new StandaloneCheckpointIDCounter();
    final ManuallyTriggeredScheduledExecutor timer = new ManuallyTriggeredScheduledExecutor();
    final CompletedCheckpointStore failingStore = new FailingCompletedCheckpointStore(failure);

    // Cleaner that counts how often the failed-store cleanup hook is hit before delegating.
    final AtomicInteger cleanupInvocations = new AtomicInteger(0);
    final CheckpointsCleaner countingCleaner =
            new CheckpointsCleaner() {
                private static final long serialVersionUID = 2029876992397573325L;

                @Override
                public void cleanCheckpointOnFailedStoring(CompletedCheckpoint completedCheckpoint, Executor executor) {
                    cleanupInvocations.incrementAndGet();
                    super.cleanCheckpointOnFailedStoring(completedCheckpoint, executor);
                }
            };

    final CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setCheckpointIDCounter(idCounter)
                    .setCheckpointsCleaner(countingCleaner)
                    .setCompletedCheckpointStore(failingStore)
                    .setTimer(timer)
                    .build();

    coordinator.triggerCheckpoint(false);
    timer.triggerAll();

    try {
        final AcknowledgeCheckpoint ack =
                new AcknowledgeCheckpoint(executionGraph.getJobID(), attempt, idCounter.getLast());
        coordinator.receiveAcknowledgeMessage(ack, "unknown location");
        fail("CheckpointException should have been thrown.");
    } catch (CheckpointException e) {
        // The acknowledge is expected to fail while finalizing the checkpoint.
        assertThat(e.getCheckpointFailureReason(), is(CheckpointFailureReason.FINALIZE_CHECKPOINT_FAILURE));
    }

    assertThat(cleanupInvocations.get(), is(expectedCleanupCalls));
}
use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.
the class CheckpointCoordinatorFailureTest method testFailingCompletedCheckpointStoreAdd.
/**
 * Tests that a failure while storing a completed checkpoint in the completed checkpoint store
 * will properly fail the originating pending checkpoint and clean up the completed checkpoint,
 * including every state handle that was acknowledged with it.
 */
@Test
public void testFailingCompletedCheckpointStoreAdd() throws Exception {
    final JobVertexID vertexId = new JobVertexID();
    final ManuallyTriggeredScheduledExecutor timer = new ManuallyTriggeredScheduledExecutor();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId)
                    .build();
    final ExecutionVertex taskVertex = executionGraph.getJobVertex(vertexId).getTaskVertices()[0];

    // Coordinator wired to a FailingCompletedCheckpointStore built from a plain Exception.
    final FailingCompletedCheckpointStore failingStore =
            new FailingCompletedCheckpointStore(
                    new Exception("The failing completed checkpoint store failed again... :-("));
    final CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setCompletedCheckpointStore(failingStore)
                    .setTimer(timer)
                    .build();

    coordinator.triggerCheckpoint(false);
    timer.triggerAll();

    // Exactly one checkpoint is pending and it has not been disposed yet.
    assertEquals(1, coordinator.getNumberOfPendingCheckpoints());
    final PendingCheckpoint pending = coordinator.getPendingCheckpoints().values().iterator().next();
    assertFalse(pending.isDisposed());
    final long pendingId = coordinator.getPendingCheckpoints().keySet().iterator().next();

    // Mocked handles for each kind of state carried by the acknowledged subtask.
    final KeyedStateHandle managedKeyed = mock(KeyedStateHandle.class);
    final KeyedStateHandle rawKeyed = mock(KeyedStateHandle.class);
    final OperatorStateHandle managedOperator = mock(OperatorStreamStateHandle.class);
    final OperatorStateHandle rawOperator = mock(OperatorStreamStateHandle.class);
    final InputChannelStateHandle inputChannel =
            new InputChannelStateHandle(
                    new InputChannelInfo(0, 1),
                    mock(StreamStateHandle.class),
                    Collections.singletonList(1L));
    final ResultSubpartitionStateHandle resultSubpartition =
            new ResultSubpartitionStateHandle(
                    new ResultSubpartitionInfo(0, 1),
                    mock(StreamStateHandle.class),
                    Collections.singletonList(1L));

    final OperatorSubtaskState spiedOperatorState =
            spy(
                    OperatorSubtaskState.builder()
                            .setManagedOperatorState(managedOperator)
                            .setRawOperatorState(rawOperator)
                            .setManagedKeyedState(managedKeyed)
                            .setRawKeyedState(rawKeyed)
                            .setInputChannelState(StateObjectCollection.singleton(inputChannel))
                            .setResultSubpartitionState(StateObjectCollection.singleton(resultSubpartition))
                            .build());

    final TaskStateSnapshot spiedSnapshot = spy(new TaskStateSnapshot());
    spiedSnapshot.putSubtaskStateByOperatorID(new OperatorID(), spiedOperatorState);
    // Stub the lookup for the vertex-derived operator ID to return the spied subtask state.
    when(spiedSnapshot.getSubtaskStateByOperatorID(OperatorID.fromJobVertexID(taskVertex.getJobvertexId())))
            .thenReturn(spiedOperatorState);

    final AcknowledgeCheckpoint ack =
            new AcknowledgeCheckpoint(
                    executionGraph.getJobID(),
                    taskVertex.getCurrentExecutionAttempt().getAttemptId(),
                    pendingId,
                    new CheckpointMetrics(),
                    spiedSnapshot);

    try {
        coordinator.receiveAcknowledgeMessage(ack, "Unknown location");
        fail("Expected a checkpoint exception because the completed checkpoint store could not " + "store the completed checkpoint.");
    } catch (CheckpointException e) {
        // expected: storing the completed checkpoint fails
    }

    // The pending checkpoint must be disposed once completing it failed.
    assertTrue(pending.isDisposed());

    // The subtask state and each contained handle must have been discarded as well.
    verify(spiedOperatorState).discardState();
    verify(spiedOperatorState.getManagedOperatorState().iterator().next()).discardState();
    verify(spiedOperatorState.getRawOperatorState().iterator().next()).discardState();
    verify(spiedOperatorState.getManagedKeyedState().iterator().next()).discardState();
    verify(spiedOperatorState.getRawKeyedState().iterator().next()).discardState();
    verify(spiedOperatorState.getInputChannelState().iterator().next().getDelegate()).discardState();
    verify(spiedOperatorState.getResultSubpartitionState().iterator().next().getDelegate()).discardState();
}
use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.
the class CheckpointCoordinatorRestoringTest method testRestoreLatestCheckpointedStateWithChangingParallelism.
/**
 * Tests the checkpoint restoration with changing parallelism of job vertex with partitioned
 * state.
 *
 * <p>A checkpoint is completed on a graph with two job vertices, then restored onto a new graph
 * in which only the second vertex has a different parallelism. The first vertex keeps
 * parallelism 3 in both graphs.
 *
 * @param scaleOut if {@code true} the second vertex goes from parallelism 2 to 13, otherwise
 *     from 13 to 2
 * @throws Exception if building the graphs, completing the checkpoint or restoring fails
 */
private void testRestoreLatestCheckpointedStateWithChangingParallelism(boolean scaleOut) throws Exception {
final JobVertexID jobVertexID1 = new JobVertexID();
final JobVertexID jobVertexID2 = new JobVertexID();
int parallelism1 = 3;
// old and new parallelism of vertex 2 are mirror images of each other, selected by scaleOut
int parallelism2 = scaleOut ? 2 : 13;
int maxParallelism1 = 42;
int maxParallelism2 = 13;
int newParallelism2 = scaleOut ? 13 : 2;
// the same store instance is later handed to the second coordinator so it can restore from it
CompletedCheckpointStore completedCheckpointStore = new EmbeddedCompletedCheckpointStore();
final ExecutionGraph graph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1, parallelism1, maxParallelism1).addJobVertex(jobVertexID2, parallelism2, maxParallelism2).build();
final ExecutionJobVertex jobVertex1 = graph.getJobVertex(jobVertexID1);
final ExecutionJobVertex jobVertex2 = graph.getJobVertex(jobVertexID2);
// set up the coordinator and validate the initial state
// NOTE(review): manuallyTriggeredScheduledExecutor is not declared in this method; presumably a
// test-class field - confirm
CheckpointCoordinator coord = new CheckpointCoordinatorBuilder().setExecutionGraph(graph).setCompletedCheckpointStore(completedCheckpointStore).setTimer(manuallyTriggeredScheduledExecutor).build();
// trigger the checkpoint
coord.triggerCheckpoint(false);
manuallyTriggeredScheduledExecutor.triggerAll();
assertEquals(1, coord.getPendingCheckpoints().size());
long checkpointId = Iterables.getOnlyElement(coord.getPendingCheckpoints().keySet());
// key-group ranges per subtask under the ORIGINAL parallelism of each vertex
List<KeyGroupRange> keyGroupPartitions1 = StateAssignmentOperation.createKeyGroupPartitions(maxParallelism1, parallelism1);
List<KeyGroupRange> keyGroupPartitions2 = StateAssignmentOperation.createKeyGroupPartitions(maxParallelism2, parallelism2);
// vertex 1: acknowledge every subtask with managed operator state, managed and raw keyed state
// and one input channel state handle (no raw operator state is set here)
for (int index = 0; index < jobVertex1.getParallelism(); index++) {
OperatorStateHandle opStateBackend = generatePartitionableStateHandle(jobVertexID1, index, 2, 8, false);
KeyGroupsStateHandle keyedStateBackend = generateKeyGroupState(jobVertexID1, keyGroupPartitions1.get(index), false);
KeyGroupsStateHandle keyedStateRaw = generateKeyGroupState(jobVertexID1, keyGroupPartitions1.get(index), true);
OperatorSubtaskState operatorSubtaskState = OperatorSubtaskState.builder().setManagedOperatorState(opStateBackend).setManagedKeyedState(keyedStateBackend).setRawKeyedState(keyedStateRaw).setInputChannelState(StateObjectCollection.singleton(createNewInputChannelStateHandle(3, new Random()))).build();
TaskStateSnapshot taskOperatorSubtaskStates = new TaskStateSnapshot();
taskOperatorSubtaskStates.putSubtaskStateByOperatorID(OperatorID.fromJobVertexID(jobVertexID1), operatorSubtaskState);
AcknowledgeCheckpoint acknowledgeCheckpoint = new AcknowledgeCheckpoint(graph.getJobID(), jobVertex1.getTaskVertices()[index].getCurrentExecutionAttempt().getAttemptId(), checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStates);
coord.receiveAcknowledgeMessage(acknowledgeCheckpoint, TASK_MANAGER_LOCATION_INFO);
}
// vertex 2: acknowledge every subtask with managed and raw operator state plus managed and raw
// keyed state, remembering the operator state handles as the expected restore result
final List<ChainedStateHandle<OperatorStateHandle>> expectedOpStatesBackend = new ArrayList<>(jobVertex2.getParallelism());
final List<ChainedStateHandle<OperatorStateHandle>> expectedOpStatesRaw = new ArrayList<>(jobVertex2.getParallelism());
for (int index = 0; index < jobVertex2.getParallelism(); index++) {
KeyGroupsStateHandle keyedStateBackend = generateKeyGroupState(jobVertexID2, keyGroupPartitions2.get(index), false);
KeyGroupsStateHandle keyedStateRaw = generateKeyGroupState(jobVertexID2, keyGroupPartitions2.get(index), true);
OperatorStateHandle opStateBackend = generatePartitionableStateHandle(jobVertexID2, index, 2, 8, false);
OperatorStateHandle opStateRaw = generatePartitionableStateHandle(jobVertexID2, index, 2, 8, true);
expectedOpStatesBackend.add(new ChainedStateHandle<>(singletonList(opStateBackend)));
expectedOpStatesRaw.add(new ChainedStateHandle<>(singletonList(opStateRaw)));
OperatorSubtaskState operatorSubtaskState = OperatorSubtaskState.builder().setManagedOperatorState(opStateBackend).setRawOperatorState(opStateRaw).setManagedKeyedState(keyedStateBackend).setRawKeyedState(keyedStateRaw).build();
TaskStateSnapshot taskOperatorSubtaskStates = new TaskStateSnapshot();
taskOperatorSubtaskStates.putSubtaskStateByOperatorID(OperatorID.fromJobVertexID(jobVertexID2), operatorSubtaskState);
AcknowledgeCheckpoint acknowledgeCheckpoint = new AcknowledgeCheckpoint(graph.getJobID(), jobVertex2.getTaskVertices()[index].getCurrentExecutionAttempt().getAttemptId(), checkpointId, new CheckpointMetrics(), taskOperatorSubtaskStates);
coord.receiveAcknowledgeMessage(acknowledgeCheckpoint, TASK_MANAGER_LOCATION_INFO);
}
// after all subtasks of both vertices acknowledged, exactly one checkpoint must be complete
List<CompletedCheckpoint> completedCheckpoints = coord.getSuccessfulCheckpoints();
assertEquals(1, completedCheckpoints.size());
// key-group ranges per subtask of vertex 2 under its NEW parallelism
List<KeyGroupRange> newKeyGroupPartitions2 = StateAssignmentOperation.createKeyGroupPartitions(maxParallelism2, newParallelism2);
// rescale vertex 2: build a second graph with the same vertex IDs where only vertex 2's
// parallelism differs
final ExecutionGraph newGraph = new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder().addJobVertex(jobVertexID1, parallelism1, maxParallelism1).addJobVertex(jobVertexID2, newParallelism2, maxParallelism2).build();
final ExecutionJobVertex newJobVertex1 = newGraph.getJobVertex(jobVertexID1);
final ExecutionJobVertex newJobVertex2 = newGraph.getJobVertex(jobVertexID2);
// set up a second coordinator on the new graph, sharing the completed checkpoint store
CheckpointCoordinator newCoord = new CheckpointCoordinatorBuilder().setExecutionGraph(newGraph).setCompletedCheckpointStore(completedCheckpointStore).setTimer(manuallyTriggeredScheduledExecutor).build();
Set<ExecutionJobVertex> tasks = new HashSet<>();
tasks.add(newJobVertex1);
tasks.add(newJobVertex2);
assertTrue(newCoord.restoreLatestCheckpointedStateToAll(tasks, false));
// verify the restored state
// vertex 1 kept its parallelism, so it is checked against the original key-group partitions
verifyStateRestore(jobVertexID1, newJobVertex1, keyGroupPartitions1);
// vertex 2: collect the restored operator state per subtask and per operator for comparison
List<List<Collection<OperatorStateHandle>>> actualOpStatesBackend = new ArrayList<>(newJobVertex2.getParallelism());
List<List<Collection<OperatorStateHandle>>> actualOpStatesRaw = new ArrayList<>(newJobVertex2.getParallelism());
for (int i = 0; i < newJobVertex2.getParallelism(); i++) {
List<OperatorIDPair> operatorIDs = newJobVertex2.getOperatorIDs();
// expected keyed state for this subtask, regenerated for the new key-group range
KeyGroupsStateHandle originalKeyedStateBackend = generateKeyGroupState(jobVertexID2, newKeyGroupPartitions2.get(i), false);
KeyGroupsStateHandle originalKeyedStateRaw = generateKeyGroupState(jobVertexID2, newKeyGroupPartitions2.get(i), true);
JobManagerTaskRestore taskRestore = newJobVertex2.getTaskVertices()[i].getCurrentExecutionAttempt().getTaskRestore();
// the restore must reference checkpoint ID 1
Assert.assertEquals(1L, taskRestore.getRestoreCheckpointId());
TaskStateSnapshot taskStateHandles = taskRestore.getTaskStateSnapshot();
// keyed state is only compared for the operator at this index
// NOTE(review): presumably the head operator is the last entry of getOperatorIDs() - confirm
final int headOpIndex = operatorIDs.size() - 1;
List<Collection<OperatorStateHandle>> allParallelManagedOpStates = new ArrayList<>(operatorIDs.size());
List<Collection<OperatorStateHandle>> allParallelRawOpStates = new ArrayList<>(operatorIDs.size());
for (int idx = 0; idx < operatorIDs.size(); ++idx) {
OperatorID operatorID = operatorIDs.get(idx).getGeneratedOperatorID();
OperatorSubtaskState opState = taskStateHandles.getSubtaskStateByOperatorID(operatorID);
Collection<OperatorStateHandle> opStateBackend = opState.getManagedOperatorState();
Collection<OperatorStateHandle> opStateRaw = opState.getRawOperatorState();
allParallelManagedOpStates.add(opStateBackend);
allParallelRawOpStates.add(opStateRaw);
if (idx == headOpIndex) {
Collection<KeyedStateHandle> keyedStateBackend = opState.getManagedKeyedState();
Collection<KeyedStateHandle> keyGroupStateRaw = opState.getRawKeyedState();
compareKeyedState(singletonList(originalKeyedStateBackend), keyedStateBackend);
compareKeyedState(singletonList(originalKeyedStateRaw), keyGroupStateRaw);
}
}
actualOpStatesBackend.add(allParallelManagedOpStates);
actualOpStatesRaw.add(allParallelRawOpStates);
}
// compare the operator state acknowledged under the old parallelism with what was restored
// under the new parallelism
comparePartitionableState(expectedOpStatesBackend, actualOpStatesBackend);
comparePartitionableState(expectedOpStatesRaw, actualOpStatesRaw);
}
use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.
the class CheckpointCoordinatorRestoringTest method testJobGraphModificationsAreCheckedForSavepoint.
/**
 * Completes a savepoint with one coordinator, then restores it with a second coordinator whose
 * {@code VertexFinishedStateChecker} records that {@code validateOperatorsFinishedState()} was
 * invoked, and asserts that the invocation happened.
 */
@Test
public void testJobGraphModificationsAreCheckedForSavepoint() throws Exception {
    final JobVertexID vertexId = new JobVertexID();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId, 1, 1)
                    .build();

    // Step 1: trigger a savepoint into a fresh temporary folder.
    final CheckpointCoordinator savepointCoordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();
    final File savepointDir = tmpFolder.newFolder();
    final CompletableFuture<CompletedCheckpoint> pendingSavepoint =
            savepointCoordinator.triggerSavepoint(
                    "file://" + savepointDir.getAbsolutePath(), SavepointFormatType.CANONICAL);
    manuallyTriggeredScheduledExecutor.triggerAll();

    // Step 2: acknowledge the only subtask so the savepoint future completes.
    final long savepointId =
            savepointCoordinator.getPendingCheckpoints().keySet().stream().findFirst().get();
    final AcknowledgeCheckpoint ack =
            new AcknowledgeCheckpoint(
                    executionGraph.getJobID(),
                    executionGraph.getJobVertex(vertexId).getTaskVertices()[0]
                            .getCurrentExecutionAttempt()
                            .getAttemptId(),
                    savepointId);
    savepointCoordinator.receiveAcknowledgeMessage(ack, "localhost");
    assertTrue(pendingSavepoint.isDone());

    // Step 3: restore with a coordinator whose finished-state checker only flips a flag.
    final BooleanValue validationInvoked = new BooleanValue(false);
    final CheckpointCoordinator restoringCoordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setVertexFinishedStateCheckerFactory(
                            (vertices, states) ->
                                    new VertexFinishedStateChecker(vertices, states) {
                                        @Override
                                        public void validateOperatorsFinishedState() {
                                            validationInvoked.set(true);
                                        }
                                    })
                    .build();
    final SavepointRestoreSettings restoreSettings =
            SavepointRestoreSettings.forPath(pendingSavepoint.get().getExternalPointer());
    restoringCoordinator.restoreSavepoint(
            restoreSettings, executionGraph.getAllVertices(), getClass().getClassLoader());

    assertTrue("The finished states should be checked when job is restored on startup", validationInvoked.get());
}
use of org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder in project flink by apache.
the class CheckpointCoordinatorRestoringTest method testRestoreLatestCheckpointedStateWithoutInFlightData.
/**
 * Completes a checkpoint that carries operator, keyed, input channel and result subpartition
 * state on a coordinator configured with {@code setCheckpointIdOfIgnoredInFlightData(1)}, then
 * restores it and checks that the input channel and result subpartition state are empty while
 * all operator and keyed state (managed and raw) is still present.
 */
@Test
public void testRestoreLatestCheckpointedStateWithoutInFlightData() throws Exception {
    // given: Operator with not empty states.
    final JobVertexID jobVertexID = new JobVertexID();
    int parallelism1 = 3;
    int maxParallelism1 = 42;
    CompletedCheckpointStore completedCheckpointStore = new EmbeddedCompletedCheckpointStore();
    final ExecutionGraph graph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(jobVertexID, parallelism1, maxParallelism1)
                    .build();
    final ExecutionJobVertex jobVertex = graph.getJobVertex(jobVertexID);

    // set up the coordinator; in-flight data of checkpoint 1 is configured to be ignored
    CheckpointCoordinator coord =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(graph)
                    .setCompletedCheckpointStore(completedCheckpointStore)
                    .setCheckpointCoordinatorConfiguration(
                            new CheckpointCoordinatorConfigurationBuilder()
                                    .setCheckpointIdOfIgnoredInFlightData(1)
                                    .build())
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();

    // trigger the checkpoint
    coord.triggerCheckpoint(false);
    manuallyTriggeredScheduledExecutor.triggerAll();
    assertEquals(1, coord.getPendingCheckpoints().size());
    long checkpointId = Iterables.getOnlyElement(coord.getPendingCheckpoints().keySet());
    List<KeyGroupRange> keyGroupPartitions1 =
            StateAssignmentOperation.createKeyGroupPartitions(maxParallelism1, parallelism1);
    Random random = new Random();

    // fill the states and complete the checkpoint.
    for (int index = 0; index < jobVertex.getParallelism(); index++) {
        OperatorSubtaskState operatorSubtaskState =
                OperatorSubtaskState.builder()
                        .setManagedOperatorState(generatePartitionableStateHandle(jobVertexID, index, 2, 8, false))
                        .setRawOperatorState(generatePartitionableStateHandle(jobVertexID, index, 2, 8, true))
                        .setManagedKeyedState(generateKeyGroupState(jobVertexID, keyGroupPartitions1.get(index), false))
                        .setRawKeyedState(generateKeyGroupState(jobVertexID, keyGroupPartitions1.get(index), true))
                        .setInputChannelState(StateObjectCollection.singleton(createNewInputChannelStateHandle(3, random)))
                        .setResultSubpartitionState(StateObjectCollection.singleton(createNewResultSubpartitionStateHandle(3, random)))
                        .build();
        TaskStateSnapshot taskOperatorSubtaskStates = new TaskStateSnapshot();
        taskOperatorSubtaskStates.putSubtaskStateByOperatorID(OperatorID.fromJobVertexID(jobVertexID), operatorSubtaskState);
        AcknowledgeCheckpoint acknowledgeCheckpoint =
                new AcknowledgeCheckpoint(
                        graph.getJobID(),
                        jobVertex.getTaskVertices()[index].getCurrentExecutionAttempt().getAttemptId(),
                        checkpointId,
                        new CheckpointMetrics(),
                        taskOperatorSubtaskStates);
        coord.receiveAcknowledgeMessage(acknowledgeCheckpoint, TASK_MANAGER_LOCATION_INFO);
    }
    assertEquals(1, coord.getSuccessfulCheckpoints().size());

    // when: Restore latest checkpoint without in-flight data.
    Set<ExecutionJobVertex> tasks = new HashSet<>();
    tasks.add(jobVertex);
    assertTrue(coord.restoreLatestCheckpointedStateToAll(tasks, false));

    // then: All states should be restored successfully except InputChannel and
    // ResultSubpartition which should be ignored.
    verifyStateRestore(jobVertexID, jobVertex, keyGroupPartitions1);
    for (int i = 0; i < jobVertex.getParallelism(); i++) {
        JobManagerTaskRestore taskRestore =
                jobVertex.getTaskVertices()[i].getCurrentExecutionAttempt().getTaskRestore();
        Assert.assertEquals(1L, taskRestore.getRestoreCheckpointId());
        TaskStateSnapshot stateSnapshot = taskRestore.getTaskStateSnapshot();
        OperatorSubtaskState operatorState =
                stateSnapshot.getSubtaskStateByOperatorID(OperatorID.fromJobVertexID(jobVertexID));

        // in-flight data must have been dropped
        assertTrue(operatorState.getInputChannelState().isEmpty());
        assertTrue(operatorState.getResultSubpartitionState().isEmpty());

        // every other kind of state that was acknowledged must survive the restore
        assertFalse(operatorState.getRawOperatorState().isEmpty());
        assertFalse(operatorState.getManagedOperatorState().isEmpty());
        assertFalse(operatorState.getRawKeyedState().isEmpty());
        // check the managed keyed state here; the managed operator state is already asserted above
        assertFalse(operatorState.getManagedKeyedState().isEmpty());
    }
}
Aggregations