Example 1 with CompletedCheckpoint

Use of org.apache.flink.runtime.checkpoint.CompletedCheckpoint in project flink by apache.

From the class SavepointLoader, the method loadAndValidateSavepoint:

/**
 * Loads a savepoint back as a {@link CompletedCheckpoint}.
 *
 * <p>This method verifies that tasks and parallelism still match the savepoint parameters.
 *
 * @param jobId                 The JobID of the job to load the savepoint for.
 * @param tasks                 Tasks that will possibly be reset.
 * @param savepointPath         The path of the savepoint to roll back to.
 * @param classLoader           The class loader to resolve serialized classes in legacy savepoint versions.
 * @param allowNonRestoredState Allow skipping checkpoint state that cannot be mapped
 *                              to any job vertex in tasks.
 *
 * @throws IllegalStateException If there is a mismatch between program and savepoint state.
 * @throws IOException           If the savepoint store fails.
 */
public static CompletedCheckpoint loadAndValidateSavepoint(
        JobID jobId,
        Map<JobVertexID, ExecutionJobVertex> tasks,
        String savepointPath,
        ClassLoader classLoader,
        boolean allowNonRestoredState) throws IOException {
    // (1) load the savepoint
    final Tuple2<Savepoint, StreamStateHandle> savepointAndHandle = SavepointStore.loadSavepointWithHandle(savepointPath, classLoader);
    final Savepoint savepoint = savepointAndHandle.f0;
    final StreamStateHandle metadataHandle = savepointAndHandle.f1;
    final Map<JobVertexID, TaskState> taskStates = new HashMap<>(savepoint.getTaskStates().size());
    boolean expandedToLegacyIds = false;
    // (2) validate it (parallelism, etc)
    for (TaskState taskState : savepoint.getTaskStates()) {
        ExecutionJobVertex executionJobVertex = tasks.get(taskState.getJobVertexID());
        // If the direct lookup fails, retry with legacy JobVertexIDs included,
        // for example as generated from older Flink versions, to provide backwards compatibility.
        if (executionJobVertex == null && !expandedToLegacyIds) {
            tasks = ExecutionJobVertex.includeLegacyJobVertexIDs(tasks);
            executionJobVertex = tasks.get(taskState.getJobVertexID());
            expandedToLegacyIds = true;
            LOG.info("Could not find ExecutionJobVertex. Including legacy JobVertexIDs in search.");
        }
        if (executionJobVertex != null) {
            if (executionJobVertex.getMaxParallelism() == taskState.getMaxParallelism() || !executionJobVertex.isMaxParallelismConfigured()) {
                taskStates.put(taskState.getJobVertexID(), taskState);
            } else {
                String msg = String.format("Failed to rollback to savepoint %s. " + "Max parallelism mismatch between savepoint state and new program. " + "Cannot map operator %s with max parallelism %d to new program with " + "max parallelism %d. This indicates that the program has been changed " + "in a non-compatible way after the savepoint.", savepoint, taskState.getJobVertexID(), taskState.getMaxParallelism(), executionJobVertex.getMaxParallelism());
                throw new IllegalStateException(msg);
            }
        } else if (allowNonRestoredState) {
            LOG.info("Skipping savepoint state for operator {}.", taskState.getJobVertexID());
        } else {
            String msg = String.format("Failed to rollback to savepoint %s. " + "Cannot map savepoint state for operator %s to the new program, " + "because the operator is not available in the new program. If " + "you want to allow to skip this, you can set the --allowNonRestoredState " + "option on the CLI.", savepointPath, taskState.getJobVertexID());
            throw new IllegalStateException(msg);
        }
    }
    // (3) convert to checkpoint so the system can fall back to it
    CheckpointProperties props = CheckpointProperties.forStandardSavepoint();
    // The two 0L arguments carry timing information that is unknown when
    // loading a savepoint from a path.
    return new CompletedCheckpoint(jobId, savepoint.getCheckpointId(), 0L, 0L, taskStates, props, metadataHandle, savepointPath);
}
Imports used in this example:

import java.util.HashMap;
import org.apache.flink.runtime.checkpoint.CheckpointProperties;
import org.apache.flink.runtime.checkpoint.CompletedCheckpoint;
import org.apache.flink.runtime.checkpoint.TaskState;
import org.apache.flink.runtime.executiongraph.ExecutionJobVertex;
import org.apache.flink.runtime.jobgraph.JobVertexID;
import org.apache.flink.runtime.state.StreamStateHandle;
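
For context, here is a minimal sketch of how a caller might drive this loader during job recovery. The restoreFromSavepoint helper is hypothetical and not part of Flink; ExecutionGraph#getAllVertices() and ExecutionGraph#getJobID() are existing accessors used to assemble the arguments.

import java.io.IOException;
import java.util.Map;
import org.apache.flink.runtime.checkpoint.CompletedCheckpoint;
import org.apache.flink.runtime.checkpoint.savepoint.SavepointLoader;
import org.apache.flink.runtime.executiongraph.ExecutionGraph;
import org.apache.flink.runtime.executiongraph.ExecutionJobVertex;
import org.apache.flink.runtime.jobgraph.JobVertexID;

// Hypothetical helper showing one way to call the loader; the name and
// surrounding setup are illustrative only.
static CompletedCheckpoint restoreFromSavepoint(ExecutionGraph graph, String savepointPath) throws IOException {
    Map<JobVertexID, ExecutionJobVertex> tasks = graph.getAllVertices();
    return SavepointLoader.loadAndValidateSavepoint(
            graph.getJobID(),
            tasks,
            savepointPath,
            Thread.currentThread().getContextClassLoader(),
            // fail fast if savepoint state cannot be mapped to the new program
            false);
}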

Example 2 with CompletedCheckpoint

Use of org.apache.flink.runtime.checkpoint.CompletedCheckpoint in project flink by apache.

From the class SavepointLoaderTest, the method testLoadAndValidateSavepoint:

/**
 * Tests loading and validation of savepoints with a correct setup,
 * a max parallelism mismatch, and a missing task.
 */
@Test
public void testLoadAndValidateSavepoint() throws Exception {
    File tmp = tmpFolder.newFolder();
    int parallelism = 128128;
    long checkpointId = Integer.MAX_VALUE + 123123L;
    JobVertexID vertexId = new JobVertexID();
    TaskState state = mock(TaskState.class);
    when(state.getParallelism()).thenReturn(parallelism);
    when(state.getJobVertexID()).thenReturn(vertexId);
    when(state.getMaxParallelism()).thenReturn(parallelism);
    when(state.getChainLength()).thenReturn(1);
    Map<JobVertexID, TaskState> taskStates = new HashMap<>();
    taskStates.put(vertexId, state);
    JobID jobId = new JobID();
    // Store savepoint
    SavepointV1 savepoint = new SavepointV1(checkpointId, taskStates.values());
    String path = SavepointStore.storeSavepoint(tmp.getAbsolutePath(), savepoint);
    ExecutionJobVertex vertex = mock(ExecutionJobVertex.class);
    when(vertex.getParallelism()).thenReturn(parallelism);
    when(vertex.getMaxParallelism()).thenReturn(parallelism);
    Map<JobVertexID, ExecutionJobVertex> tasks = new HashMap<>();
    tasks.put(vertexId, vertex);
    ClassLoader ucl = Thread.currentThread().getContextClassLoader();
    // 1) Load and validate: everything correct
    CompletedCheckpoint loaded = SavepointLoader.loadAndValidateSavepoint(jobId, tasks, path, ucl, false);
    assertEquals(jobId, loaded.getJobId());
    assertEquals(checkpointId, loaded.getCheckpointID());
    // 2) Load and validate: max parallelism mismatch
    when(vertex.getMaxParallelism()).thenReturn(222);
    when(vertex.isMaxParallelismConfigured()).thenReturn(true);
    try {
        SavepointLoader.loadAndValidateSavepoint(jobId, tasks, path, ucl, false);
        fail("Did not throw expected Exception");
    } catch (IllegalStateException expected) {
        assertTrue(expected.getMessage().contains("Max parallelism mismatch"));
    }
    // 3) Load and validate: missing vertex
    assertNotNull(tasks.remove(vertexId));
    try {
        SavepointLoader.loadAndValidateSavepoint(jobId, tasks, path, ucl, false);
        fail("Did not throw expected Exception");
    } catch (IllegalStateException expected) {
        assertTrue(expected.getMessage().contains("allowNonRestoredState"));
    }
    // 4) Load and validate: ignore missing vertex
    SavepointLoader.loadAndValidateSavepoint(jobId, tasks, path, ucl, true);
}
Imports used in this example:

import java.io.File;
import java.util.HashMap;
import org.apache.flink.api.common.JobID;
import org.apache.flink.runtime.checkpoint.CompletedCheckpoint;
import org.apache.flink.runtime.checkpoint.TaskState;
import org.apache.flink.runtime.executiongraph.ExecutionJobVertex;
import org.apache.flink.runtime.jobgraph.JobVertexID;
import org.junit.Test;
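
Step 4 of the test loads the savepoint with allowNonRestoredState set but does not inspect the result. A hedged extension, assuming CompletedCheckpoint exposes its task states via a getTaskStates() accessor in this Flink version, could additionally verify that the skipped operator's state was dropped:

// Sketch only: would replace step 4 above. getTaskStates() is assumed to
// be available on CompletedCheckpoint; adjust if the accessor differs.
CompletedCheckpoint ignored = SavepointLoader.loadAndValidateSavepoint(jobId, tasks, path, ucl, true);
assertTrue(ignored.getTaskStates().isEmpty());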
