Use of org.apache.flink.runtime.state.testutils.TestCompletedCheckpointStorageLocation in the Apache Flink project.
From class CheckpointCoordinatorRestoringTest, method testStateRecoveryWithTopologyChange.
/**
 * Restores a checkpoint taken on an old topology into a job graph whose operator chains were
 * re-arranged, and verifies the state handed to every subtask of the new graph.
 *
 * <p>Old topology:
 *
 * <p>[operator1, operator2] * parallelism1 -> [operator3, operator4] * parallelism2
 *
 * <p>New topology:
 *
 * <p>[operator5, operator1, operator2] * newParallelism1 -> [operator3, operator6] *
 * newParallelism2
 *
 * <p>operator5 and operator6 are new and must be restored without operator state. operator4
 * has state in the checkpoint but no counterpart in the new graph, which is why the restore is
 * performed with non-restored state allowed. Only operator3 carries keyed state.
 *
 * @param scaleType how the parallelism of the second vertex changes on restore: {@code
 *     INCREASE_PARALLELISM} scales it from 10 to 20, {@code DECREASE_PARALLELISM} from 10 to
 *     8, any other value keeps it at 10. The first vertex always keeps parallelism 10.
 * @throws Exception if building the graph, restoring, or reading a state stream fails
 */
public void testStateRecoveryWithTopologyChange(TestScaleType scaleType) throws Exception {
    /*
     * Old topology
     * CHAIN(op1 -> op2) * parallelism1 -> CHAIN(op3 -> op4) * parallelism2
     */
    Tuple2<JobVertexID, OperatorID> id1 = generateIDPair();
    Tuple2<JobVertexID, OperatorID> id2 = generateIDPair();
    int parallelism1 = 10;
    int maxParallelism1 = 64;

    Tuple2<JobVertexID, OperatorID> id3 = generateIDPair();
    Tuple2<JobVertexID, OperatorID> id4 = generateIDPair();
    int parallelism2 = 10;
    int maxParallelism2 = 64;

    List<KeyGroupRange> keyGroupPartitions2 =
            StateAssignmentOperation.createKeyGroupPartitions(maxParallelism2, parallelism2);

    Map<OperatorID, OperatorState> operatorStates = new HashMap<>();

    // prepare vertex1 state
    for (Tuple2<JobVertexID, OperatorID> id : Arrays.asList(id1, id2)) {
        OperatorState taskState = new OperatorState(id.f1, parallelism1, maxParallelism1);
        operatorStates.put(id.f1, taskState);
        for (int index = 0; index < taskState.getParallelism(); index++) {
            OperatorSubtaskState subtaskState =
                    OperatorSubtaskState.builder()
                            .setManagedOperatorState(
                                    generatePartitionableStateHandle(id.f0, index, 2, 8, false))
                            .setRawOperatorState(
                                    generatePartitionableStateHandle(id.f0, index, 2, 8, true))
                            .build();
            taskState.putState(index, subtaskState);
        }
    }

    // One entry per old operator of vertex2 (index 0 = operator3, index 1 = operator4).
    List<List<ChainedStateHandle<OperatorStateHandle>>> expectedManagedOperatorStates =
            new ArrayList<>();
    List<List<ChainedStateHandle<OperatorStateHandle>>> expectedRawOperatorStates =
            new ArrayList<>();

    // prepare vertex2 state
    for (Tuple2<JobVertexID, OperatorID> id : Arrays.asList(id3, id4)) {
        OperatorState operatorState = new OperatorState(id.f1, parallelism2, maxParallelism2);
        operatorStates.put(id.f1, operatorState);

        List<ChainedStateHandle<OperatorStateHandle>> expectedManagedOperatorState =
                new ArrayList<>();
        List<ChainedStateHandle<OperatorStateHandle>> expectedRawOperatorState =
                new ArrayList<>();
        expectedManagedOperatorStates.add(expectedManagedOperatorState);
        expectedRawOperatorStates.add(expectedRawOperatorState);

        for (int index = 0; index < operatorState.getParallelism(); index++) {
            final OperatorSubtaskState.Builder stateBuilder = OperatorSubtaskState.builder();
            OperatorStateHandle subManagedOperatorState =
                    generateChainedPartitionableStateHandle(id.f0, index, 2, 8, false).get(0);
            OperatorStateHandle subRawOperatorState =
                    generateChainedPartitionableStateHandle(id.f0, index, 2, 8, true).get(0);

            // Only operator3 gets keyed state (managed and raw).
            if (id.f0.equals(id3.f0)) {
                stateBuilder.setManagedKeyedState(
                        generateKeyGroupState(id.f0, keyGroupPartitions2.get(index), false));
                stateBuilder.setRawKeyedState(
                        generateKeyGroupState(id.f0, keyGroupPartitions2.get(index), true));
            }

            expectedManagedOperatorState.add(
                    ChainedStateHandle.wrapSingleHandle(subManagedOperatorState));
            expectedRawOperatorState.add(
                    ChainedStateHandle.wrapSingleHandle(subRawOperatorState));

            OperatorSubtaskState subtaskState =
                    stateBuilder
                            .setManagedOperatorState(subManagedOperatorState)
                            .setRawOperatorState(subRawOperatorState)
                            .build();
            operatorState.putState(index, subtaskState);
        }
    }

    /*
     * New topology
     * CHAIN(op5 -> op1 -> op2) * newParallelism1 -> CHAIN(op3 -> op6) * newParallelism2
     */
    Tuple2<JobVertexID, OperatorID> id5 = generateIDPair();
    int newParallelism1 = 10;

    Tuple2<JobVertexID, OperatorID> id6 = generateIDPair();
    int newParallelism2 = parallelism2;
    if (scaleType == TestScaleType.INCREASE_PARALLELISM) {
        newParallelism2 = 20;
    } else if (scaleType == TestScaleType.DECREASE_PARALLELISM) {
        newParallelism2 = 8;
    }

    List<KeyGroupRange> newKeyGroupPartitions2 =
            StateAssignmentOperation.createKeyGroupPartitions(maxParallelism2, newParallelism2);

    // Operator IDs are listed in reverse chain order (tail first), so the chain head is the
    // last element of each list.
    ExecutionGraph newGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(
                            id5.f0,
                            newParallelism1,
                            maxParallelism1,
                            Stream.of(id2.f1, id1.f1, id5.f1)
                                    .map(OperatorIDPair::generatedIDOnly)
                                    .collect(Collectors.toList()),
                            true)
                    .addJobVertex(
                            id3.f0,
                            newParallelism2,
                            maxParallelism2,
                            Stream.of(id6.f1, id3.f1)
                                    .map(OperatorIDPair::generatedIDOnly)
                                    .collect(Collectors.toList()),
                            true)
                    .build();

    ExecutionJobVertex newJobVertex1 = newGraph.getJobVertex(id5.f0);
    ExecutionJobVertex newJobVertex2 = newGraph.getJobVertex(id3.f0);

    Set<ExecutionJobVertex> tasks = new HashSet<>();
    tasks.add(newJobVertex1);
    tasks.add(newJobVertex2);

    CompletedCheckpoint completedCheckpoint =
            new CompletedCheckpoint(
                    newGraph.getJobID(),
                    2,
                    System.currentTimeMillis(),
                    System.currentTimeMillis() + 3000,
                    operatorStates,
                    Collections.<MasterState>emptyList(),
                    CheckpointProperties.forCheckpoint(
                            CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION),
                    new TestCompletedCheckpointStorageLocation());

    // set up the coordinator and validate the initial state
    SharedStateRegistry sharedStateRegistry =
            SharedStateRegistry.DEFAULT_FACTORY.create(Executors.directExecutor(), emptyList());
    CheckpointCoordinator coord =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(newGraph)
                    .setCompletedCheckpointStore(
                            storeFor(sharedStateRegistry, () -> {}, completedCheckpoint))
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();

    // operator4's state has no target in the new graph, so non-restored state must be allowed.
    coord.restoreLatestCheckpointedStateToAll(tasks, true);

    // ---- verify the first vertex: CHAIN(op5 -> op1 -> op2) ----
    final List<OperatorIDPair> vertex1OperatorIDs = newJobVertex1.getOperatorIDs();
    for (int i = 0; i < newJobVertex1.getParallelism(); i++) {
        JobManagerTaskRestore taskRestore =
                newJobVertex1.getTaskVertices()[i].getCurrentExecutionAttempt().getTaskRestore();
        assertEquals(2L, taskRestore.getRestoreCheckpointId());
        TaskStateSnapshot stateSnapshot = taskRestore.getTaskStateSnapshot();

        OperatorSubtaskState headOpState =
                stateSnapshot.getSubtaskStateByOperatorID(
                        vertex1OperatorIDs
                                .get(vertex1OperatorIDs.size() - 1)
                                .getGeneratedOperatorID());
        assertTrue(headOpState.getManagedKeyedState().isEmpty());
        assertTrue(headOpState.getRawKeyedState().isEmpty());

        // operator5 (new operator, nothing to restore)
        OperatorSubtaskState op5State =
                stateSnapshot.getSubtaskStateByOperatorID(
                        vertex1OperatorIDs.get(2).getGeneratedOperatorID());
        assertTrue(op5State.getManagedOperatorState().isEmpty());
        assertTrue(op5State.getRawOperatorState().isEmpty());

        // operator1
        OperatorSubtaskState op1State =
                stateSnapshot.getSubtaskStateByOperatorID(
                        vertex1OperatorIDs.get(1).getGeneratedOperatorID());
        assertPartitionableStateRestored(id1.f0, i, op1State);

        // operator2
        OperatorSubtaskState op2State =
                stateSnapshot.getSubtaskStateByOperatorID(
                        vertex1OperatorIDs.get(0).getGeneratedOperatorID());
        assertPartitionableStateRestored(id2.f0, i, op2State);
    }

    // ---- verify the second vertex: CHAIN(op3 -> op6) ----
    List<List<Collection<OperatorStateHandle>>> actualManagedOperatorStates =
            new ArrayList<>(newJobVertex2.getParallelism());
    List<List<Collection<OperatorStateHandle>>> actualRawOperatorStates =
            new ArrayList<>(newJobVertex2.getParallelism());

    final List<OperatorIDPair> vertex2OperatorIDs = newJobVertex2.getOperatorIDs();
    for (int i = 0; i < newJobVertex2.getParallelism(); i++) {
        JobManagerTaskRestore taskRestore =
                newJobVertex2.getTaskVertices()[i].getCurrentExecutionAttempt().getTaskRestore();
        assertEquals(2L, taskRestore.getRestoreCheckpointId());
        TaskStateSnapshot stateSnapshot = taskRestore.getTaskStateSnapshot();

        // operator 3: collect the restored operator state for the comparison after the loop
        OperatorSubtaskState op3State =
                stateSnapshot.getSubtaskStateByOperatorID(
                        vertex2OperatorIDs.get(1).getGeneratedOperatorID());
        List<Collection<OperatorStateHandle>> actualSubManagedOperatorState = new ArrayList<>(1);
        actualSubManagedOperatorState.add(op3State.getManagedOperatorState());
        List<Collection<OperatorStateHandle>> actualSubRawOperatorState = new ArrayList<>(1);
        actualSubRawOperatorState.add(op3State.getRawOperatorState());
        actualManagedOperatorStates.add(actualSubManagedOperatorState);
        actualRawOperatorStates.add(actualSubRawOperatorState);

        // operator 6 (new operator, nothing to restore)
        OperatorSubtaskState op6State =
                stateSnapshot.getSubtaskStateByOperatorID(
                        vertex2OperatorIDs.get(0).getGeneratedOperatorID());
        assertTrue(op6State.getManagedOperatorState().isEmpty());
        assertTrue(op6State.getRawOperatorState().isEmpty());

        // Keyed state of the chain head must match the key-group range of the NEW parallelism.
        KeyGroupsStateHandle originalKeyedStateBackend =
                generateKeyGroupState(id3.f0, newKeyGroupPartitions2.get(i), false);
        KeyGroupsStateHandle originalKeyedStateRaw =
                generateKeyGroupState(id3.f0, newKeyGroupPartitions2.get(i), true);

        OperatorSubtaskState headOpState =
                stateSnapshot.getSubtaskStateByOperatorID(
                        vertex2OperatorIDs
                                .get(vertex2OperatorIDs.size() - 1)
                                .getGeneratedOperatorID());

        Collection<KeyedStateHandle> keyedStateBackend = headOpState.getManagedKeyedState();
        Collection<KeyedStateHandle> keyGroupStateRaw = headOpState.getRawKeyedState();

        compareKeyedState(singletonList(originalKeyedStateBackend), keyedStateBackend);
        compareKeyedState(singletonList(originalKeyedStateRaw), keyGroupStateRaw);
    }

    // Index 0 holds the expectations for operator3, the only old operator of vertex2 that
    // still exists in the new graph.
    comparePartitionableState(expectedManagedOperatorStates.get(0), actualManagedOperatorStates);
    comparePartitionableState(expectedRawOperatorStates.get(0), actualRawOperatorStates);
}

/**
 * Asserts that a restored subtask state holds exactly one managed and one raw operator state
 * handle, each with the same stream content as the handle generated for the given old job
 * vertex and subtask index.
 *
 * @param originalVertexId job vertex ID that was used to generate the checkpointed state
 * @param subtaskIndex index of the subtask whose state is expected
 * @param restoredState the subtask state that was assigned on restore
 * @throws Exception if a state stream cannot be opened or read
 */
private void assertPartitionableStateRestored(
        JobVertexID originalVertexId, int subtaskIndex, OperatorSubtaskState restoredState)
        throws Exception {
    OperatorStateHandle expectedManagedOpState =
            generatePartitionableStateHandle(originalVertexId, subtaskIndex, 2, 8, false);
    OperatorStateHandle expectedRawOpState =
            generatePartitionableStateHandle(originalVertexId, subtaskIndex, 2, 8, true);

    Collection<OperatorStateHandle> managedOperatorState =
            restoredState.getManagedOperatorState();
    assertEquals(1, managedOperatorState.size());
    assertTrue(
            CommonTestUtils.isStreamContentEqual(
                    expectedManagedOpState.openInputStream(),
                    managedOperatorState.iterator().next().openInputStream()));

    Collection<OperatorStateHandle> rawOperatorState = restoredState.getRawOperatorState();
    assertEquals(1, rawOperatorState.size());
    assertTrue(
            CommonTestUtils.isStreamContentEqual(
                    expectedRawOpState.openInputStream(),
                    rawOperatorState.iterator().next().openInputStream()));
}
Use of org.apache.flink.runtime.state.testutils.TestCompletedCheckpointStorageLocation in the Apache Flink project.
From class CheckpointCoordinatorRestoringTest, method testRestoreFinishedStateWithoutInFlightData.
/**
 * Verifies that a task whose only operator is recorded as fully finished in the checkpoint is
 * restored with a snapshot flagged as "deployed as finished", while the coordinator is
 * configured to ignore in-flight data of that same checkpoint (ID 2).
 */
@Test
public void testRestoreFinishedStateWithoutInFlightData() throws Exception {
    // given: a single-operator vertex (parallelism 1) whose operator is fully finished in the
    // checkpoint.
    final JobVertexID jobVertexID = new JobVertexID();
    final OperatorIDPair finishedOperator = OperatorIDPair.generatedIDOnly(new OperatorID());
    final OperatorID finishedOperatorId = finishedOperator.getGeneratedOperatorID();

    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(jobVertexID, 1, 1, singletonList(finishedOperator), true)
                    .build();

    final Map<OperatorID, OperatorState> statesByOperator = new HashMap<>();
    statesByOperator.put(
            finishedOperatorId, new FullyFinishedOperatorState(finishedOperatorId, 1, 1));

    final CheckpointProperties checkpointProperties =
            CheckpointProperties.forCheckpoint(
                    CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION);
    final CompletedCheckpoint finishedCheckpoint =
            new CompletedCheckpoint(
                    executionGraph.getJobID(),
                    2,
                    System.currentTimeMillis(),
                    System.currentTimeMillis() + 3000,
                    statesByOperator,
                    Collections.emptyList(),
                    checkpointProperties,
                    new TestCompletedCheckpointStorageLocation());

    final CompletedCheckpointStore checkpointStore = new EmbeddedCompletedCheckpointStore();
    checkpointStore.addCheckpointAndSubsumeOldestOne(
            finishedCheckpoint, new CheckpointsCleaner(), () -> {});

    final CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setCheckpointCoordinatorConfiguration(
                            new CheckpointCoordinatorConfigurationBuilder()
                                    .setCheckpointIdOfIgnoredInFlightData(2)
                                    .build())
                    .setCompletedCheckpointStore(checkpointStore)
                    .build();

    // when: the initial checkpoint is restored to the vertex.
    final ExecutionJobVertex restoredVertex = executionGraph.getJobVertex(jobVertexID);
    coordinator.restoreInitialCheckpointIfPresent(Collections.singleton(restoredVertex));

    // then: the only subtask is marked as deployed in finished state.
    final TaskStateSnapshot restoredSnapshot =
            restoredVertex
                    .getTaskVertices()[0]
                    .getCurrentExecutionAttempt()
                    .getTaskRestore()
                    .getTaskStateSnapshot();
    assertTrue(restoredSnapshot.isTaskDeployedAsFinished());
}
Use of org.apache.flink.runtime.state.testutils.TestCompletedCheckpointStorageLocation in the Apache Flink project.
From class CheckpointCoordinatorRestoringTest, method testJobGraphModificationsAreCheckedForInitialCheckpoint.
/**
 * Verifies that restoring the initial checkpoint invokes the finished-state validation of the
 * {@link VertexFinishedStateChecker} produced by the coordinator's checker factory.
 */
@Test
public void testJobGraphModificationsAreCheckedForInitialCheckpoint() throws Exception {
    final JobVertexID jobVertexID = new JobVertexID();
    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(jobVertexID, 1, 1)
                    .build();

    // A checkpoint without any operator or master state is enough to trigger a restore.
    final CompletedCheckpoint emptyCheckpoint =
            new CompletedCheckpoint(
                    executionGraph.getJobID(),
                    2,
                    System.currentTimeMillis(),
                    System.currentTimeMillis() + 3000,
                    Collections.emptyMap(),
                    Collections.emptyList(),
                    CheckpointProperties.forCheckpoint(
                            CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION),
                    new TestCompletedCheckpointStorageLocation());

    final CompletedCheckpointStore checkpointStore = new EmbeddedCompletedCheckpointStore();
    checkpointStore.addCheckpointAndSubsumeOldestOne(
            emptyCheckpoint, new CheckpointsCleaner(), () -> {});

    // Flipped to true by the overridden checker below once validation is requested.
    final BooleanValue validationInvoked = new BooleanValue(false);
    final CheckpointCoordinator restoreCoordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setCompletedCheckpointStore(checkpointStore)
                    .setVertexFinishedStateCheckerFactory(
                            (vertices, states) ->
                                    new VertexFinishedStateChecker(vertices, states) {
                                        @Override
                                        public void validateOperatorsFinishedState() {
                                            validationInvoked.set(true);
                                        }
                                    })
                    .build();

    restoreCoordinator.restoreInitialCheckpointIfPresent(
            new HashSet<>(executionGraph.getAllVertices().values()));

    assertTrue(
            "The finished states should be checked when job is restored on startup",
            validationInvoked.get());
}
Use of org.apache.flink.runtime.state.testutils.TestCompletedCheckpointStorageLocation in the Apache Flink project.
From class CheckpointCoordinatorMasterHooksTest, method testHooksAreCalledOnRestore.
/**
 * Verifies that restoring a checkpoint calls {@code restoreCheckpoint} exactly once on every
 * registered master hook: stateful hooks receive their deserialized state, and a hook without
 * a matching master state receives {@code null}.
 */
@Test
public void testHooksAreCalledOnRestore() throws Exception {
    final long checkpointId = 13L;

    // Master states stored in the checkpoint: one String-valued, one Long-valued.
    final String stringHookId = "id1";
    final String longHookId = "id2";

    final String stringHookState = "the-test-string-state";
    final byte[] serializedStringState = new StringSerializer().serialize(stringHookState);

    final long longHookState = 987654321L;
    final byte[] serializedLongState = new LongSerializer().serialize(longHookState);

    final MasterState stringMasterState =
            new MasterState(stringHookId, serializedStringState, StringSerializer.VERSION);
    final MasterState longMasterState =
            new MasterState(longHookId, serializedLongState, LongSerializer.VERSION);
    final List<MasterState> masterHookStates =
            Arrays.asList(stringMasterState, longMasterState);

    // Stateful hooks: triggering a checkpoint on them is not expected during a restore.
    final MasterTriggerRestoreHook<String> stringHook =
            mockGeneric(MasterTriggerRestoreHook.class);
    when(stringHook.getIdentifier()).thenReturn(stringHookId);
    when(stringHook.createCheckpointDataSerializer()).thenReturn(new StringSerializer());
    when(stringHook.triggerCheckpoint(anyLong(), anyLong(), any(Executor.class)))
            .thenThrow(new Exception("not expected"));

    final MasterTriggerRestoreHook<Long> longHook = mockGeneric(MasterTriggerRestoreHook.class);
    when(longHook.getIdentifier()).thenReturn(longHookId);
    when(longHook.createCheckpointDataSerializer()).thenReturn(new LongSerializer());
    when(longHook.triggerCheckpoint(anyLong(), anyLong(), any(Executor.class)))
            .thenThrow(new Exception("not expected"));

    // Hook whose identifier has no entry among the master states of the checkpoint.
    final MasterTriggerRestoreHook<Void> hookWithoutState =
            mockGeneric(MasterTriggerRestoreHook.class);
    when(hookWithoutState.getIdentifier()).thenReturn("some-id");

    final JobID jobId = new JobID();
    final CompletedCheckpoint checkpointWithHookStates =
            new CompletedCheckpoint(
                    jobId,
                    checkpointId,
                    123L,
                    125L,
                    Collections.<OperatorID, OperatorState>emptyMap(),
                    masterHookStates,
                    CheckpointProperties.forCheckpoint(
                            CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION),
                    new TestCompletedCheckpointStorageLocation());

    final ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(new JobVertexID())
                    .build();
    final CheckpointCoordinator coordinator = instantiateCheckpointCoordinator(executionGraph);

    coordinator.addMasterHook(stringHook);
    coordinator.addMasterHook(hookWithoutState);
    coordinator.addMasterHook(longHook);

    coordinator
            .getCheckpointStore()
            .addCheckpointAndSubsumeOldestOne(
                    checkpointWithHookStates, new CheckpointsCleaner(), () -> {});
    coordinator.restoreLatestCheckpointedStateToAll(Collections.emptySet(), false);

    verify(stringHook, times(1)).restoreCheckpoint(eq(checkpointId), eq(stringHookState));
    verify(longHook, times(1)).restoreCheckpoint(eq(checkpointId), eq(longHookState));
    verify(hookWithoutState, times(1)).restoreCheckpoint(eq(checkpointId), isNull(Void.class));
}
Use of org.apache.flink.runtime.state.testutils.TestCompletedCheckpointStorageLocation in the Apache Flink project.
From class CheckpointStateRestoreTest, method testNonRestoredState.
/**
 * Tests that the "allow non restored state" flag is handled correctly.
 *
 * <p>The flag only matters for state that is part of the checkpoint: a job vertex without any
 * checkpointed state restores fine either way, whereas checkpointed state without a matching
 * job vertex is only accepted when the flag is set.
 */
@Test
public void testNonRestoredState() throws Exception {
    // --- (1) Create tasks to restore checkpoint with ---
    final JobVertexID firstVertexId = new JobVertexID();
    final JobVertexID secondVertexId = new JobVertexID();
    final OperatorID firstOperatorId = OperatorID.fromJobVertexID(firstVertexId);

    // 1st JobVertex: three subtasks
    final ExecutionVertex[] firstVertexSubtasks = {
        mockExecutionVertex(mockExecution(), firstVertexId, 0, 3),
        mockExecutionVertex(mockExecution(), firstVertexId, 1, 3),
        mockExecutionVertex(mockExecution(), firstVertexId, 2, 3)
    };
    // 2nd JobVertex: two subtasks
    final ExecutionVertex[] secondVertexSubtasks = {
        mockExecutionVertex(mockExecution(), secondVertexId, 0, 2),
        mockExecutionVertex(mockExecution(), secondVertexId, 1, 2)
    };

    final ExecutionJobVertex firstJobVertex =
            mockExecutionJobVertex(firstVertexId, firstVertexSubtasks);
    final ExecutionJobVertex secondJobVertex =
            mockExecutionJobVertex(secondVertexId, secondVertexSubtasks);

    final Set<ExecutionJobVertex> tasks = new HashSet<>();
    tasks.add(firstJobVertex);
    tasks.add(secondJobVertex);

    final CheckpointCoordinator coordinator = new CheckpointCoordinatorBuilder().build();

    // --- (2) Checkpoint misses state for a jobVertex (should work) ---
    // Only the first vertex's operator has state; the second vertex has none.
    final Map<OperatorID, OperatorState> checkpointedStates = new HashMap<>();
    final OperatorState firstOperatorState = new OperatorState(firstOperatorId, 3, 3);
    for (int subtask = 0; subtask < 3; subtask++) {
        firstOperatorState.putState(subtask, OperatorSubtaskState.builder().build());
    }
    checkpointedStates.put(firstOperatorId, firstOperatorState);

    final CompletedCheckpoint checkpointWithMissingState =
            new CompletedCheckpoint(
                    new JobID(),
                    0,
                    1,
                    2,
                    new HashMap<>(checkpointedStates),
                    Collections.<MasterState>emptyList(),
                    CheckpointProperties.forCheckpoint(
                            CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION),
                    new TestCompletedCheckpointStorageLocation());
    coordinator
            .getCheckpointStore()
            .addCheckpointAndSubsumeOldestOne(
                    checkpointWithMissingState, new CheckpointsCleaner(), () -> {});

    final boolean restoredStrict = coordinator.restoreLatestCheckpointedStateToAll(tasks, false);
    assertTrue(restoredStrict);
    final boolean restoredLenient = coordinator.restoreLatestCheckpointedStateToAll(tasks, true);
    assertTrue(restoredLenient);

    // --- (3) JobVertex missing for task state that is part of the checkpoint ---
    // There is no task for this operator.
    final JobVertexID unknownVertexId = new JobVertexID();
    final OperatorID unknownOperatorId = OperatorID.fromJobVertexID(unknownVertexId);
    final OperatorState unknownOperatorState = new OperatorState(unknownOperatorId, 1, 1);
    unknownOperatorState.putState(0, OperatorSubtaskState.builder().build());
    checkpointedStates.put(unknownOperatorId, unknownOperatorState);

    final CompletedCheckpoint checkpointWithExtraState =
            new CompletedCheckpoint(
                    new JobID(),
                    1,
                    2,
                    3,
                    new HashMap<>(checkpointedStates),
                    Collections.<MasterState>emptyList(),
                    CheckpointProperties.forCheckpoint(
                            CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION),
                    new TestCompletedCheckpointStorageLocation());
    coordinator
            .getCheckpointStore()
            .addCheckpointAndSubsumeOldestOne(
                    checkpointWithExtraState, new CheckpointsCleaner(), () -> {});

    // (i) Allow non restored state (should succeed)
    assertTrue(coordinator.restoreLatestCheckpointedStateToAll(tasks, true));

    // (ii) Don't allow non restored state (should fail)
    try {
        coordinator.restoreLatestCheckpointedStateToAll(tasks, false);
        fail("Did not throw the expected Exception.");
    } catch (IllegalStateException ignored) {
        // Expected: the checkpoint contains state that cannot be mapped to any task.
    }
}
Aggregations