Use of org.apache.flink.runtime.checkpoint.TaskState in project flink by apache.
The class SavepointV1Serializer, method serialize().
@Override
public void serialize(SavepointV1 savepoint, DataOutputStream dos) throws IOException {
    try {
        dos.writeLong(savepoint.getCheckpointId());

        Collection<TaskState> taskStates = savepoint.getTaskStates();
        dos.writeInt(taskStates.size());

        for (TaskState taskState : savepoint.getTaskStates()) {
            // Vertex ID
            dos.writeLong(taskState.getJobVertexID().getLowerPart());
            dos.writeLong(taskState.getJobVertexID().getUpperPart());

            // Parallelism
            int parallelism = taskState.getParallelism();
            dos.writeInt(parallelism);
            dos.writeInt(taskState.getMaxParallelism());
            dos.writeInt(taskState.getChainLength());

            // Sub task states
            Map<Integer, SubtaskState> subtaskStateMap = taskState.getSubtaskStates();
            dos.writeInt(subtaskStateMap.size());

            for (Map.Entry<Integer, SubtaskState> entry : subtaskStateMap.entrySet()) {
                dos.writeInt(entry.getKey());
                serializeSubtaskState(entry.getValue(), dos);
            }
        }
    } catch (Exception e) {
        throw new IOException(e);
    }
}
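The write order above fixes the wire layout: checkpoint ID, task-state count, then per task the two halves of the vertex ID, the parallelism fields, and the subtask-state map. Below is a minimal sketch of the matching read side; deserializeSubtaskState is assumed as the counterpart of serializeSubtaskState, and the real SavepointV1Serializer#deserialize remains the authoritative implementation.

public SavepointV1 deserializeSketch(DataInputStream dis) throws IOException {
    // Read fields back in exactly the order serialize() wrote them.
    long checkpointId = dis.readLong();
    int numTaskStates = dis.readInt();
    List<TaskState> taskStates = new ArrayList<>(numTaskStates);

    for (int i = 0; i < numTaskStates; i++) {
        // Vertex ID was written as lower part first, then upper part
        JobVertexID jobVertexId = new JobVertexID(dis.readLong(), dis.readLong());
        int parallelism = dis.readInt();
        int maxParallelism = dis.readInt();
        int chainLength = dis.readInt();

        TaskState taskState = new TaskState(jobVertexId, parallelism, maxParallelism, chainLength);

        int numSubtaskStates = dis.readInt();
        for (int j = 0; j < numSubtaskStates; j++) {
            int subtaskIndex = dis.readInt();
            // Assumed read-side counterpart to serializeSubtaskState
            taskState.putState(subtaskIndex, deserializeSubtaskState(dis));
        }
        taskStates.add(taskState);
    }
    return new SavepointV1(checkpointId, taskStates);
}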
Use of org.apache.flink.runtime.checkpoint.TaskState in project flink by apache.
The class SavepointLoader, method loadAndValidateSavepoint().
/**
 * Loads a savepoint back as a {@link CompletedCheckpoint}.
 *
 * <p>This method verifies that tasks and parallelism still match the savepoint parameters.
 *
 * @param jobId The JobID of the job to load the savepoint for.
 * @param tasks Tasks that will possibly be reset
 * @param savepointPath The path of the savepoint to roll back to
 * @param classLoader The class loader to resolve serialized classes in legacy savepoint versions.
 * @param allowNonRestoredState Whether to allow skipping checkpoint state that cannot be
 * mapped to any job vertex in tasks.
 *
 * @throws IllegalStateException If there is a mismatch between the program and the savepoint state
 * @throws IOException If the savepoint store fails
 */
public static CompletedCheckpoint loadAndValidateSavepoint(
        JobID jobId,
        Map<JobVertexID, ExecutionJobVertex> tasks,
        String savepointPath,
        ClassLoader classLoader,
        boolean allowNonRestoredState) throws IOException {

    // (1) load the savepoint
    final Tuple2<Savepoint, StreamStateHandle> savepointAndHandle =
            SavepointStore.loadSavepointWithHandle(savepointPath, classLoader);

    final Savepoint savepoint = savepointAndHandle.f0;
    final StreamStateHandle metadataHandle = savepointAndHandle.f1;

    final Map<JobVertexID, TaskState> taskStates = new HashMap<>(savepoint.getTaskStates().size());
    boolean expandedToLegacyIds = false;

    // (2) validate it (parallelism, etc.)
    for (TaskState taskState : savepoint.getTaskStates()) {
        ExecutionJobVertex executionJobVertex = tasks.get(taskState.getJobVertexID());

        // Fall back to legacy JobVertexIDs, for example as generated by older
        // Flink versions, to provide backwards compatibility.
        if (executionJobVertex == null && !expandedToLegacyIds) {
            tasks = ExecutionJobVertex.includeLegacyJobVertexIDs(tasks);
            executionJobVertex = tasks.get(taskState.getJobVertexID());
            expandedToLegacyIds = true;
            LOG.info("Could not find ExecutionJobVertex. Including legacy JobVertexIDs in search.");
        }

        if (executionJobVertex != null) {
            if (executionJobVertex.getMaxParallelism() == taskState.getMaxParallelism()
                    || !executionJobVertex.isMaxParallelismConfigured()) {
                taskStates.put(taskState.getJobVertexID(), taskState);
            } else {
                String msg = String.format("Failed to rollback to savepoint %s. " +
                        "Max parallelism mismatch between savepoint state and new program. " +
                        "Cannot map operator %s with max parallelism %d to new program with " +
                        "max parallelism %d. This indicates that the program has been changed " +
                        "in a non-compatible way after the savepoint.",
                        savepoint, taskState.getJobVertexID(),
                        taskState.getMaxParallelism(), executionJobVertex.getMaxParallelism());
                throw new IllegalStateException(msg);
            }
        } else if (allowNonRestoredState) {
            LOG.info("Skipping savepoint state for operator {}.", taskState.getJobVertexID());
        } else {
            String msg = String.format("Failed to rollback to savepoint %s. " +
                    "Cannot map savepoint state for operator %s to the new program, " +
                    "because the operator is not available in the new program. If " +
                    "you want to allow to skip this, you can set the --allowNonRestoredState " +
                    "option on the CLI.",
                    savepointPath, taskState.getJobVertexID());
            throw new IllegalStateException(msg);
        }
    }

    // (3) convert to checkpoint so the system can fall back to it
    CheckpointProperties props = CheckpointProperties.forStandardSavepoint();
    return new CompletedCheckpoint(jobId, savepoint.getCheckpointId(), 0L, 0L, taskStates, props, metadataHandle, savepointPath);
}
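A minimal usage sketch follows. It assumes an ExecutionGraph-style object in scope (executionGraph here is a stand-in) whose getAllVertices() yields the Map<JobVertexID, ExecutionJobVertex> the loader expects; the savepoint path is hypothetical.

// Sketch only: executionGraph and the savepoint path are assumptions.
Map<JobVertexID, ExecutionJobVertex> tasks = executionGraph.getAllVertices();

CompletedCheckpoint restored = SavepointLoader.loadAndValidateSavepoint(
        executionGraph.getJobID(),
        tasks,
        "hdfs:///flink/savepoints/savepoint-1234",  // hypothetical path
        Thread.currentThread().getContextClassLoader(),
        false);  // fail if savepoint state cannot be mapped to the new program

Passing true for the last argument instead makes the loader log and skip state for operators that no longer exist, mirroring the --allowNonRestoredState CLI option mentioned in the error message.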
Use of org.apache.flink.runtime.checkpoint.TaskState in project flink by apache.
The class MigrationV0ToV1Test, method testSavepointMigrationV0ToV1().
/**
 * Tests migration of a savepoint from the V0 format: writes a V0 savepoint to disk
 * and verifies the restored state after loading it back through the SavepointStore.
 */
@Test
public void testSavepointMigrationV0ToV1() throws Exception {
    String target = tmp.getRoot().getAbsolutePath();
    assertEquals(0, tmp.getRoot().listFiles().length);

    long checkpointId = ThreadLocalRandom.current().nextLong(Integer.MAX_VALUE);
    int numTaskStates = 4;
    int numSubtaskStates = 16;

    Collection<org.apache.flink.migration.runtime.checkpoint.TaskState> expected =
            createTaskStatesOld(numTaskStates, numSubtaskStates);

    SavepointV0 savepoint = new SavepointV0(checkpointId, expected);
    assertEquals(SavepointV0.VERSION, savepoint.getVersion());
    assertEquals(checkpointId, savepoint.getCheckpointId());
    assertEquals(expected, savepoint.getOldTaskStates());
    assertFalse(savepoint.getOldTaskStates().isEmpty());

    Exception latestException = null;
    Path path = null;
    FSDataOutputStream fdos = null;
    FileSystem fs = null;

    try {
        // Try to create a FS output stream
        for (int attempt = 0; attempt < 10; attempt++) {
            path = new Path(target, FileUtils.getRandomFilename("savepoint-"));
            if (fs == null) {
                fs = FileSystem.get(path.toUri());
            }
            try {
                fdos = fs.create(path, false);
                break;
            } catch (Exception e) {
                latestException = e;
            }
        }

        if (fdos == null) {
            throw new IOException("Failed to create file output stream at " + path, latestException);
        }

        try (DataOutputStream dos = new DataOutputStream(fdos)) {
            dos.writeInt(SavepointStore.MAGIC_NUMBER);
            dos.writeInt(savepoint.getVersion());
            SavepointV0Serializer.INSTANCE.serializeOld(savepoint, dos);
        }

        ClassLoader cl = Thread.currentThread().getContextClassLoader();
        Savepoint sp = SavepointStore.loadSavepoint(path.toString(), cl);

        int t = 0;
        for (TaskState taskState : sp.getTaskStates()) {
            for (int p = 0; p < taskState.getParallelism(); ++p) {
                SubtaskState subtaskState = taskState.getState(p);
                ChainedStateHandle<StreamStateHandle> legacyOperatorState = subtaskState.getLegacyOperatorState();

                for (int c = 0; c < legacyOperatorState.getLength(); ++c) {
                    StreamStateHandle stateHandle = legacyOperatorState.get(c);
                    try (InputStream is = stateHandle.openInputStream()) {
                        Tuple4<Integer, Integer, Integer, Integer> expTestState = new Tuple4<>(0, t, p, c);
                        Tuple4<Integer, Integer, Integer, Integer> actTestState;

                        // check function state
                        if (p % 4 != 0) {
                            assertEquals(1, is.read());
                            actTestState = InstantiationUtil.deserializeObject(is, cl);
                            assertEquals(expTestState, actTestState);
                        } else {
                            assertEquals(0, is.read());
                        }

                        // check operator state
                        expTestState.f0 = 1;
                        actTestState = InstantiationUtil.deserializeObject(is, cl);
                        assertEquals(expTestState, actTestState);
                    }
                }

                // check keyed state
                KeyGroupsStateHandle keyGroupsStateHandle = subtaskState.getManagedKeyedState();
                if (t % 3 != 0) {
                    assertEquals(1, keyGroupsStateHandle.getNumberOfKeyGroups());
                    assertEquals(p, keyGroupsStateHandle.getGroupRangeOffsets().getKeyGroupRange().getStartKeyGroup());

                    ByteStreamStateHandle stateHandle =
                            (ByteStreamStateHandle) keyGroupsStateHandle.getDelegateStateHandle();
                    HashMap<String, KvStateSnapshot<?, ?, ?, ?>> testKeyedState =
                            MigrationInstantiationUtil.deserializeObject(stateHandle.getData(), cl);

                    assertEquals(2, testKeyedState.size());
                    for (KvStateSnapshot<?, ?, ?, ?> snapshot : testKeyedState.values()) {
                        MemValueState.Snapshot<?, ?, ?> castedSnapshot = (MemValueState.Snapshot<?, ?, ?>) snapshot;
                        byte[] data = castedSnapshot.getData();
                        assertEquals(t, data[0]);
                        assertEquals(p, data[1]);
                    }
                } else {
                    assertEquals(null, keyGroupsStateHandle);
                }
            }
            ++t;
        }

        savepoint.dispose();
    } finally {
        // Dispose
        SavepointStore.removeSavepointFile(path.toString());
    }
}
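The test writes the savepoint file header by hand: magic number, then version, then the version-specific payload. A minimal sketch of how a loader would consume that header, simplified relative to the real SavepointStore.loadSavepoint (which also dispatches to the serializer registered for the version); fs and path are as in the test above.

// Sketch only: simplified header handling.
try (DataInputStream dis = new DataInputStream(fs.open(path))) {
    int magic = dis.readInt();
    if (magic != SavepointStore.MAGIC_NUMBER) {
        throw new IOException("Not a savepoint file: unexpected magic number " + magic);
    }
    int version = dis.readInt();  // e.g. SavepointV0.VERSION for legacy files
    // ... pick the SavepointSerializer matching 'version' and deserialize the payload
}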
Use of org.apache.flink.runtime.checkpoint.TaskState in project flink by apache.
The class SavepointLoaderTest, method testLoadAndValidateSavepoint().
/**
* Tests loading and validation of savepoints with correct setup,
* parallelism mismatch, and a missing task.
*/
@Test
public void testLoadAndValidateSavepoint() throws Exception {
    File tmp = tmpFolder.newFolder();

    int parallelism = 128128;
    long checkpointId = Integer.MAX_VALUE + 123123L;
    JobVertexID vertexId = new JobVertexID();

    TaskState state = mock(TaskState.class);
    when(state.getParallelism()).thenReturn(parallelism);
    when(state.getJobVertexID()).thenReturn(vertexId);
    when(state.getMaxParallelism()).thenReturn(parallelism);
    when(state.getChainLength()).thenReturn(1);

    Map<JobVertexID, TaskState> taskStates = new HashMap<>();
    taskStates.put(vertexId, state);

    JobID jobId = new JobID();

    // Store savepoint
    SavepointV1 savepoint = new SavepointV1(checkpointId, taskStates.values());
    String path = SavepointStore.storeSavepoint(tmp.getAbsolutePath(), savepoint);

    ExecutionJobVertex vertex = mock(ExecutionJobVertex.class);
    when(vertex.getParallelism()).thenReturn(parallelism);
    when(vertex.getMaxParallelism()).thenReturn(parallelism);

    Map<JobVertexID, ExecutionJobVertex> tasks = new HashMap<>();
    tasks.put(vertexId, vertex);

    ClassLoader ucl = Thread.currentThread().getContextClassLoader();

    // 1) Load and validate: everything correct
    CompletedCheckpoint loaded = SavepointLoader.loadAndValidateSavepoint(jobId, tasks, path, ucl, false);
    assertEquals(jobId, loaded.getJobId());
    assertEquals(checkpointId, loaded.getCheckpointID());

    // 2) Load and validate: max parallelism mismatch
    when(vertex.getMaxParallelism()).thenReturn(222);
    when(vertex.isMaxParallelismConfigured()).thenReturn(true);
    try {
        SavepointLoader.loadAndValidateSavepoint(jobId, tasks, path, ucl, false);
        fail("Did not throw expected Exception");
    } catch (IllegalStateException expected) {
        assertTrue(expected.getMessage().contains("Max parallelism mismatch"));
    }

    // 3) Load and validate: missing vertex
    assertNotNull(tasks.remove(vertexId));
    try {
        SavepointLoader.loadAndValidateSavepoint(jobId, tasks, path, ucl, false);
        fail("Did not throw expected Exception");
    } catch (IllegalStateException expected) {
        assertTrue(expected.getMessage().contains("allowNonRestoredState"));
    }

    // 4) Load and validate: ignore missing vertex
    SavepointLoader.loadAndValidateSavepoint(jobId, tasks, path, ucl, true);
}
Use of org.apache.flink.runtime.checkpoint.TaskState in project flink by apache.
The class SavepointV1Test, method createTaskStates().
static Collection<TaskState> createTaskStates(int numTaskStates, int numSubtasksPerTask) throws IOException {
    Random random = new Random(numTaskStates * 31 + numSubtasksPerTask);

    List<TaskState> taskStates = new ArrayList<>(numTaskStates);

    for (int stateIdx = 0; stateIdx < numTaskStates; ++stateIdx) {
        int chainLength = 1 + random.nextInt(8);

        TaskState taskState = new TaskState(new JobVertexID(), numSubtasksPerTask, 128, chainLength);

        int noNonPartitionableStateAtIndex = random.nextInt(chainLength);
        int noOperatorStateBackendAtIndex = random.nextInt(chainLength);
        int noOperatorStateStreamAtIndex = random.nextInt(chainLength);

        boolean hasKeyedBackend = random.nextInt(4) != 0;
        boolean hasKeyedStream = random.nextInt(4) != 0;

        for (int subtaskIdx = 0; subtaskIdx < numSubtasksPerTask; subtaskIdx++) {
            List<StreamStateHandle> nonPartitionableStates = new ArrayList<>(chainLength);
            List<OperatorStateHandle> operatorStatesBackend = new ArrayList<>(chainLength);
            List<OperatorStateHandle> operatorStatesStream = new ArrayList<>(chainLength);

            for (int chainIdx = 0; chainIdx < chainLength; ++chainIdx) {
                StreamStateHandle nonPartitionableState = new TestByteStreamStateHandleDeepCompare(
                        "a-" + chainIdx, ("Hi-" + chainIdx).getBytes(ConfigConstants.DEFAULT_CHARSET));
                StreamStateHandle operatorStateBackend = new TestByteStreamStateHandleDeepCompare(
                        "b-" + chainIdx, ("Beautiful-" + chainIdx).getBytes(ConfigConstants.DEFAULT_CHARSET));
                StreamStateHandle operatorStateStream = new TestByteStreamStateHandleDeepCompare(
                        "b-" + chainIdx, ("Beautiful-" + chainIdx).getBytes(ConfigConstants.DEFAULT_CHARSET));

                Map<String, OperatorStateHandle.StateMetaInfo> offsetsMap = new HashMap<>();
                offsetsMap.put("A", new OperatorStateHandle.StateMetaInfo(new long[] { 0, 10, 20 }, OperatorStateHandle.Mode.SPLIT_DISTRIBUTE));
                offsetsMap.put("B", new OperatorStateHandle.StateMetaInfo(new long[] { 30, 40, 50 }, OperatorStateHandle.Mode.SPLIT_DISTRIBUTE));
                offsetsMap.put("C", new OperatorStateHandle.StateMetaInfo(new long[] { 60, 70, 80 }, OperatorStateHandle.Mode.BROADCAST));

                if (chainIdx != noNonPartitionableStateAtIndex) {
                    nonPartitionableStates.add(nonPartitionableState);
                }

                if (chainIdx != noOperatorStateBackendAtIndex) {
                    OperatorStateHandle operatorStateHandleBackend = new OperatorStateHandle(offsetsMap, operatorStateBackend);
                    operatorStatesBackend.add(operatorStateHandleBackend);
                }

                if (chainIdx != noOperatorStateStreamAtIndex) {
                    OperatorStateHandle operatorStateHandleStream = new OperatorStateHandle(offsetsMap, operatorStateStream);
                    operatorStatesStream.add(operatorStateHandleStream);
                }
            }

            KeyGroupsStateHandle keyedStateBackend = null;
            KeyGroupsStateHandle keyedStateStream = null;

            if (hasKeyedBackend) {
                keyedStateBackend = new KeyGroupsStateHandle(
                        new KeyGroupRangeOffsets(1, 1, new long[] { 42 }),
                        new TestByteStreamStateHandleDeepCompare("c", "Hello".getBytes(ConfigConstants.DEFAULT_CHARSET)));
            }

            if (hasKeyedStream) {
                keyedStateStream = new KeyGroupsStateHandle(
                        new KeyGroupRangeOffsets(1, 1, new long[] { 23 }),
                        new TestByteStreamStateHandleDeepCompare("d", "World".getBytes(ConfigConstants.DEFAULT_CHARSET)));
            }

            taskState.putState(subtaskIdx, new SubtaskState(
                    new ChainedStateHandle<>(nonPartitionableStates),
                    new ChainedStateHandle<>(operatorStatesBackend),
                    new ChainedStateHandle<>(operatorStatesStream),
                    keyedStateStream,
                    keyedStateBackend));
        }

        taskStates.add(taskState);
    }

    return taskStates;
}
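A minimal round-trip sketch showing how this helper pairs with the serializer from the first example. The exact deserialize signature (whether it takes a ClassLoader) and a deep equals on TaskState are assumptions here; the real SavepointV1Test exercises this through its serializer tests.

// Sketch only: deserialize signature and TaskState equality are assumptions.
Collection<TaskState> taskStates = createTaskStates(4, 16);
SavepointV1 original = new SavepointV1(42L, taskStates);

ByteArrayOutputStream baos = new ByteArrayOutputStream();
SavepointV1Serializer.INSTANCE.serialize(original, new DataOutputStream(baos));

SavepointV1 restored = SavepointV1Serializer.INSTANCE.deserialize(
        new DataInputStream(new ByteArrayInputStream(baos.toByteArray())),
        Thread.currentThread().getContextClassLoader());

assertEquals(original.getTaskStates(), restored.getTaskStates());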