Search in sources :

Example 1 with CheckpointCoordinatorConfiguration

use of org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration in project flink by apache.

From the class DispatcherCleanupITCase, method createJobGraph:

/**
 * Creates a streaming {@link JobGraph} with two vertices named "first" and "second"
 * (parallelism 1, both running {@code NoOpInvokable}) and checkpointing settings
 * attached.
 *
 * <p>The checkpoint coordinator is configured with an interval of 20, a minimum pause of
 * 20 and a timeout of 10_000; the second constructor argument of
 * {@link JobCheckpointingSettings} is passed as {@code null}.
 *
 * @return the assembled job graph
 */
private JobGraph createJobGraph() {
    final JobVertex first = new JobVertex("first");
    first.setInvokableClass(NoOpInvokable.class);
    first.setParallelism(1);

    final JobVertex second = new JobVertex("second");
    second.setInvokableClass(NoOpInvokable.class);
    second.setParallelism(1);

    final CheckpointCoordinatorConfiguration coordinatorConfig =
            CheckpointCoordinatorConfiguration.builder()
                    .setCheckpointInterval(20L)
                    .setMinPauseBetweenCheckpoints(20L)
                    .setCheckpointTimeout(10_000L)
                    .build();
    final JobCheckpointingSettings settings =
            new JobCheckpointingSettings(coordinatorConfig, null);

    return JobGraphBuilder.newStreamingJobGraphBuilder()
            .addJobVertex(first)
            .addJobVertex(second)
            .setJobCheckpointingSettings(settings)
            .build();
}
Also used : JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) JobCheckpointingSettings(org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration)

Example 2 with CheckpointCoordinatorConfiguration

use of org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration in project flink by apache.

From the class DefaultExecutionGraphBuilder, method buildGraph:

/**
 * Builds a {@link DefaultExecutionGraph} from the given {@link JobGraph}.
 *
 * <p>Steps, in the order they are performed:
 *
 * <ol>
 *   <li>construct the execution graph from the job's name, ID and configuration;
 *   <li>attach a JSON plan, falling back to {@code "{}"} if plan generation throws;
 *   <li>call {@code initializeOnMaster} on every job vertex, rejecting vertices that
 *       have no invokable class name;
 *   <li>attach the topologically sorted vertices to the graph;
 *   <li>unless {@code isDynamicGraph} is set, and only if
 *       {@code isCheckpointingEnabled(jobGraph)} holds, load the state backend,
 *       checkpoint storage and master hooks and enable checkpointing.
 * </ol>
 *
 * @param jobGraph the job to build the graph for; must not be null
 * @param jobManagerConfig configuration read for the attempt-history size and the
 *     partition-group release strategy, and handed to the state backend and checkpoint
 *     storage loaders
 * @param futureExecutor passed through to the execution graph constructor
 * @param ioExecutor passed through to the execution graph constructor
 * @param classLoader used by the graph, for master-side vertex initialization, for
 *     deserializing the backend, storage and hooks, and as the thread context class
 *     loader while hooks are created
 * @param completedCheckpointStore passed to {@code enableCheckpointing}
 * @param checkpointsCleaner passed to {@code enableCheckpointing}
 * @param checkpointIdCounter passed to {@code enableCheckpointing}
 * @param rpcTimeout passed through to the execution graph constructor
 * @param blobWriter passed through to the execution graph constructor
 * @param log logger used for progress and warning messages
 * @param shuffleMaster passed through to the execution graph constructor
 * @param partitionTracker passed through to the execution graph constructor
 * @param partitionLocationConstraint passed through to the execution graph constructor
 * @param executionDeploymentListener passed through to the execution graph constructor
 * @param executionStateUpdateListener passed through to the execution graph constructor
 * @param initializationTimestamp passed through to the execution graph constructor
 * @param vertexAttemptNumberStore passed through to the execution graph constructor
 * @param vertexParallelismStore passed through to the execution graph constructor
 * @param checkpointStatsTrackerFactory invoked only when checkpointing is enabled
 * @param isDynamicGraph passed to the graph constructor; when true, checkpointing setup
 *     is skipped with a warning
 * @return the execution graph with the job's vertices attached
 * @throws JobExecutionException if a vertex fails master-side initialization, or if the
 *     state backend, checkpoint storage or checkpoint hooks cannot be loaded; a
 *     {@code JobSubmissionException} is thrown for a vertex without an invokable class
 * @throws JobException if the execution graph constructor fails with an
 *     {@link IOException}
 */
public static DefaultExecutionGraph buildGraph(JobGraph jobGraph, Configuration jobManagerConfig, ScheduledExecutorService futureExecutor, Executor ioExecutor, ClassLoader classLoader, CompletedCheckpointStore completedCheckpointStore, CheckpointsCleaner checkpointsCleaner, CheckpointIDCounter checkpointIdCounter, Time rpcTimeout, BlobWriter blobWriter, Logger log, ShuffleMaster<?> shuffleMaster, JobMasterPartitionTracker partitionTracker, TaskDeploymentDescriptorFactory.PartitionLocationConstraint partitionLocationConstraint, ExecutionDeploymentListener executionDeploymentListener, ExecutionStateUpdateListener executionStateUpdateListener, long initializationTimestamp, VertexAttemptNumberStore vertexAttemptNumberStore, VertexParallelismStore vertexParallelismStore, Supplier<CheckpointStatsTracker> checkpointStatsTrackerFactory, boolean isDynamicGraph) throws JobExecutionException, JobException {
    checkNotNull(jobGraph, "job graph cannot be null");
    final String jobName = jobGraph.getName();
    final JobID jobId = jobGraph.getJobID();
    // bundle the job-level metadata that the execution graph carries around
    final JobInformation jobInformation = new JobInformation(jobId, jobName, jobGraph.getSerializedExecutionConfig(), jobGraph.getJobConfiguration(), jobGraph.getUserJarBlobKeys(), jobGraph.getClasspaths());
    final int maxPriorAttemptsHistoryLength = jobManagerConfig.getInteger(JobManagerOptions.MAX_ATTEMPTS_HISTORY_SIZE);
    final PartitionGroupReleaseStrategy.Factory partitionGroupReleaseStrategyFactory = PartitionGroupReleaseStrategyFactoryLoader.loadPartitionGroupReleaseStrategyFactory(jobManagerConfig);
    // create the execution graph (a new one is always created here); an IOException
    // from the constructor is surfaced as a JobException
    final DefaultExecutionGraph executionGraph;
    try {
        executionGraph = new DefaultExecutionGraph(jobInformation, futureExecutor, ioExecutor, rpcTimeout, maxPriorAttemptsHistoryLength, classLoader, blobWriter, partitionGroupReleaseStrategyFactory, shuffleMaster, partitionTracker, partitionLocationConstraint, executionDeploymentListener, executionStateUpdateListener, initializationTimestamp, vertexAttemptNumberStore, vertexParallelismStore, isDynamicGraph);
    } catch (IOException e) {
        throw new JobException("Could not create the ExecutionGraph.", e);
    }
    // the JSON plan is best-effort: any failure is logged and replaced by an empty plan
    try {
        executionGraph.setJsonPlan(JsonPlanGenerator.generatePlan(jobGraph));
    } catch (Throwable t) {
        log.warn("Cannot create JSON plan for job", t);
        // give the graph an empty plan
        executionGraph.setJsonPlan("{}");
    }
    // initialize the vertices that have a master initialization hook
    // file output formats create directories here, input formats create splits
    final long initMasterStart = System.nanoTime();
    log.info("Running initialization on master for job {} ({}).", jobName, jobId);
    for (JobVertex vertex : jobGraph.getVertices()) {
        // a vertex without an invokable class cannot be executed, so reject the job
        String executableClass = vertex.getInvokableClassName();
        if (executableClass == null || executableClass.isEmpty()) {
            throw new JobSubmissionException(jobId, "The vertex " + vertex.getID() + " (" + vertex.getName() + ") has no invokable class.");
        }
        try {
            vertex.initializeOnMaster(classLoader);
        } catch (Throwable t) {
            // any failure during master-side initialization fails the whole job
            throw new JobExecutionException(jobId, "Cannot initialize task '" + vertex.getName() + "': " + t.getMessage(), t);
        }
    }
    // elapsed nanoseconds converted to milliseconds for the log message
    log.info("Successfully ran initialization on master in {} ms.", (System.nanoTime() - initMasterStart) / 1_000_000);
    // topologically sort the job vertices and attach the graph to the existing one
    List<JobVertex> sortedTopology = jobGraph.getVerticesSortedTopologicallyFromSources();
    if (log.isDebugEnabled()) {
        log.debug("Adding {} vertices from job graph {} ({}).", sortedTopology.size(), jobName, jobId);
    }
    executionGraph.attachJobGraph(sortedTopology);
    if (log.isDebugEnabled()) {
        log.debug("Successfully created execution graph from job graph {} ({}).", jobName, jobId);
    }
    // configure the state checkpointing
    if (isDynamicGraph) {
        // dynamic graph does not support checkpointing so we skip it
        log.warn("Skip setting up checkpointing for a job with dynamic graph.");
    } else if (isCheckpointingEnabled(jobGraph)) {
        JobCheckpointingSettings snapshotSettings = jobGraph.getCheckpointingSettings();
        // load the state backend from the application settings
        // (null means the application did not ship a serialized backend)
        final StateBackend applicationConfiguredBackend;
        final SerializedValue<StateBackend> serializedAppConfigured = snapshotSettings.getDefaultStateBackend();
        if (serializedAppConfigured == null) {
            applicationConfiguredBackend = null;
        } else {
            try {
                applicationConfiguredBackend = serializedAppConfigured.deserializeValue(classLoader);
            } catch (IOException | ClassNotFoundException e) {
                throw new JobExecutionException(jobId, "Could not deserialize application-defined state backend.", e);
            }
        }
        // let the loader pick the effective backend from the application-provided one
        // (possibly null) and the job manager configuration
        final StateBackend rootBackend;
        try {
            rootBackend = StateBackendLoader.fromApplicationOrConfigOrDefault(applicationConfiguredBackend, snapshotSettings.isChangelogStateBackendEnabled(), jobManagerConfig, classLoader, log);
        } catch (IllegalConfigurationException | IOException | DynamicCodeLoadingException e) {
            throw new JobExecutionException(jobId, "Could not instantiate configured state backend", e);
        }
        // load the checkpoint storage from the application settings
        // (same pattern as the state backend: null when nothing was serialized)
        final CheckpointStorage applicationConfiguredStorage;
        final SerializedValue<CheckpointStorage> serializedAppConfiguredStorage = snapshotSettings.getDefaultCheckpointStorage();
        if (serializedAppConfiguredStorage == null) {
            applicationConfiguredStorage = null;
        } else {
            try {
                applicationConfiguredStorage = serializedAppConfiguredStorage.deserializeValue(classLoader);
            } catch (IOException | ClassNotFoundException e) {
                throw new JobExecutionException(jobId, "Could not deserialize application-defined checkpoint storage.", e);
            }
        }
        // resolve the effective checkpoint storage; the resolved state backend is passed
        // along, and the second argument is deliberately null here
        final CheckpointStorage rootStorage;
        try {
            rootStorage = CheckpointStorageLoader.load(applicationConfiguredStorage, null, rootBackend, jobManagerConfig, classLoader, log);
        } catch (IllegalConfigurationException | DynamicCodeLoadingException e) {
            throw new JobExecutionException(jobId, "Could not instantiate configured checkpoint storage", e);
        }
        // instantiate the user-defined checkpoint hooks
        final SerializedValue<MasterTriggerRestoreHook.Factory[]> serializedHooks = snapshotSettings.getMasterHooks();
        final List<MasterTriggerRestoreHook<?>> hooks;
        if (serializedHooks == null) {
            hooks = Collections.emptyList();
        } else {
            final MasterTriggerRestoreHook.Factory[] hookFactories;
            try {
                hookFactories = serializedHooks.deserializeValue(classLoader);
            } catch (IOException | ClassNotFoundException e) {
                throw new JobExecutionException(jobId, "Could not instantiate user-defined checkpoint hooks", e);
            }
            // run the hook factories with the supplied class loader as the thread context
            // class loader, and always restore the previous one afterwards
            final Thread thread = Thread.currentThread();
            final ClassLoader originalClassLoader = thread.getContextClassLoader();
            thread.setContextClassLoader(classLoader);
            try {
                hooks = new ArrayList<>(hookFactories.length);
                for (MasterTriggerRestoreHook.Factory factory : hookFactories) {
                    hooks.add(MasterHooks.wrapHook(factory.create(), classLoader));
                }
            } finally {
                thread.setContextClassLoader(originalClassLoader);
            }
        }
        // hand everything resolved above to the graph; the stats tracker is only
        // created at this point, i.e. only when checkpointing is actually enabled
        final CheckpointCoordinatorConfiguration chkConfig = snapshotSettings.getCheckpointCoordinatorConfiguration();
        executionGraph.enableCheckpointing(chkConfig, hooks, checkpointIdCounter, completedCheckpointStore, rootBackend, rootStorage, checkpointStatsTrackerFactory.get(), checkpointsCleaner);
    }
    return executionGraph;
}
Also used : ArrayList(java.util.ArrayList) JobCheckpointingSettings(org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings) TaskDeploymentDescriptorFactory(org.apache.flink.runtime.deployment.TaskDeploymentDescriptorFactory) JobSubmissionException(org.apache.flink.runtime.client.JobSubmissionException) StateBackend(org.apache.flink.runtime.state.StateBackend) MasterTriggerRestoreHook(org.apache.flink.runtime.checkpoint.MasterTriggerRestoreHook) JobException(org.apache.flink.runtime.JobException) JobExecutionException(org.apache.flink.runtime.client.JobExecutionException) CheckpointStorage(org.apache.flink.runtime.state.CheckpointStorage) ArrayList(java.util.ArrayList) List(java.util.List) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) IOException(java.io.IOException) SerializedValue(org.apache.flink.util.SerializedValue) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) JobID(org.apache.flink.api.common.JobID) PartitionGroupReleaseStrategy(org.apache.flink.runtime.executiongraph.failover.flip1.partitionrelease.PartitionGroupReleaseStrategy)

Example 3 with CheckpointCoordinatorConfiguration

use of org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration in project flink by apache.

From the class CheckpointConfigHandler, method createCheckpointConfigInfo:

/**
 * Translates the checkpoint coordinator configuration of an execution graph into the
 * REST response object {@link CheckpointConfigInfo}.
 *
 * @param executionGraph graph whose checkpoint configuration, state backend name and
 *     checkpoint storage name are read
 * @return the populated checkpoint configuration info
 * @throws RestHandlerException with status {@code NOT_FOUND} if the graph has no
 *     checkpoint coordinator configuration
 */
private static CheckpointConfigInfo createCheckpointConfigInfo(AccessExecutionGraph executionGraph) throws RestHandlerException {
    final CheckpointCoordinatorConfiguration config =
            executionGraph.getCheckpointCoordinatorConfiguration();

    // Guard clause: without a coordinator configuration there is nothing to report.
    if (config == null) {
        throw new RestHandlerException(
                "Checkpointing is not enabled for this job (" + executionGraph.getJobID() + ").",
                HttpResponseStatus.NOT_FOUND,
                RestHandlerException.LoggingBehavior.IGNORE);
    }

    final CheckpointRetentionPolicy policy = config.getCheckpointRetentionPolicy();
    final boolean retainedAfterTermination =
            policy != CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION;
    final boolean notRetainedOnCancellation =
            policy != CheckpointRetentionPolicy.RETAIN_ON_CANCELLATION;
    final CheckpointConfigInfo.ExternalizedCheckpointInfo externalizedInfo =
            new CheckpointConfigInfo.ExternalizedCheckpointInfo(
                    retainedAfterTermination, notRetainedOnCancellation);

    // Both names are optional on the graph; absent values are reported as null.
    final String backendName = executionGraph.getStateBackendName().orElse(null);
    final String storageName = executionGraph.getCheckpointStorageName().orElse(null);

    final CheckpointConfigInfo.ProcessingMode processingMode;
    if (config.isExactlyOnce()) {
        processingMode = CheckpointConfigInfo.ProcessingMode.EXACTLY_ONCE;
    } else {
        processingMode = CheckpointConfigInfo.ProcessingMode.AT_LEAST_ONCE;
    }

    return new CheckpointConfigInfo(
            processingMode,
            config.getCheckpointInterval(),
            config.getCheckpointTimeout(),
            config.getMinPauseBetweenCheckpoints(),
            config.getMaxConcurrentCheckpoints(),
            externalizedInfo,
            backendName,
            storageName,
            config.isUnalignedCheckpointsEnabled(),
            config.getTolerableCheckpointFailureNumber(),
            config.getAlignedCheckpointTimeout(),
            config.isEnableCheckpointsAfterTasksFinish());
}
Also used : CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) CheckpointConfigInfo(org.apache.flink.runtime.rest.messages.checkpoints.CheckpointConfigInfo) CheckpointRetentionPolicy(org.apache.flink.runtime.checkpoint.CheckpointRetentionPolicy) RestHandlerException(org.apache.flink.runtime.rest.handler.RestHandlerException)

Example 4 with CheckpointCoordinatorConfiguration

use of org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration in project flink by apache.

From the class CheckpointCoordinatorTest, method testMaxConcurrentAttemptsWithSubsumption:

/**
 * Verifies that, with at most two concurrent checkpoints, acknowledging checkpoint 2
 * while checkpoints 1 and 2 are pending lets the coordinator trigger checkpoints 3 and 4.
 */
@Test
public void testMaxConcurrentAttemptsWithSubsumption() throws Exception {
    final int maxConcurrentAttempts = 2;

    // single-vertex execution graph; its only task acknowledges the checkpoints
    JobVertexID vertexId = new JobVertexID();
    ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId)
                    .build();
    ExecutionVertex taskVertex = executionGraph.getJobVertex(vertexId).getTaskVertices()[0];
    ExecutionAttemptID attemptId = taskVertex.getCurrentExecutionAttempt().getAttemptId();

    CheckpointCoordinatorConfiguration coordinatorConfig =
            new CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder()
                    .setCheckpointInterval(10) // periodic interval is 10 ms
                    .setCheckpointTimeout(200000) // timeout is very long (200 s)
                    .setMinPauseBetweenCheckpoints(0L) // no extra delay
                    .setMaxConcurrentCheckpoints(maxConcurrentAttempts)
                    .build();
    CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setCheckpointCoordinatorConfiguration(coordinatorConfig)
                    .setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .build();
    coordinator.startCheckpointScheduler();

    // drive the manual executor until the concurrency limit is reached
    do {
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();
    } while (coordinator.getNumberOfPendingCheckpoints() < maxConcurrentAttempts);

    // checkpoints 1 and 2 must both be pending now
    assertEquals(maxConcurrentAttempts, coordinator.getNumberOfPendingCheckpoints());
    assertNotNull(coordinator.getPendingCheckpoints().get(1L));
    assertNotNull(coordinator.getPendingCheckpoints().get(2L));

    // acknowledging the second checkpoint should subsume the first one and thereby
    // make room for two new checkpoints
    coordinator.receiveAcknowledgeMessage(
            new AcknowledgeCheckpoint(executionGraph.getJobID(), attemptId, 2L),
            TASK_MANAGER_LOCATION_INFO);

    // keep driving the executor until the limit is reached again
    do {
        manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
        manuallyTriggeredScheduledExecutor.triggerAll();
    } while (coordinator.getNumberOfPendingCheckpoints() < maxConcurrentAttempts);

    // the two pending checkpoints are now the freshly triggered 3 and 4
    assertEquals(maxConcurrentAttempts, coordinator.getNumberOfPendingCheckpoints());
    assertNotNull(coordinator.getPendingCheckpoints().get(3L));
    assertNotNull(coordinator.getPendingCheckpoints().get(4L));

    coordinator.shutdown();
}
Also used : AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) ExecutionAttemptID(org.apache.flink.runtime.executiongraph.ExecutionAttemptID) JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) AcknowledgeCheckpoint(org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint) DeclineCheckpoint(org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder) Test(org.junit.Test)

Example 5 with CheckpointCoordinatorConfiguration

use of org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration in project flink by apache.

From the class CheckpointCoordinatorTest, method setupCheckpointCoordinatorWithInactiveTasks:

/**
 * Creates a checkpoint coordinator over a single-vertex graph that is built with
 * {@code setTransitToRunning(false)}, asserts that no checkpoint is pending after the
 * scheduler fires, then moves the vertex's current attempt to {@code RUNNING} and fires
 * the scheduler once more.
 *
 * @param checkpointStorage checkpoint storage handed to the coordinator builder
 * @return the coordinator, with its checkpoint scheduler already started
 * @throws Exception if building the graph or the coordinator fails
 */
private CheckpointCoordinator setupCheckpointCoordinatorWithInactiveTasks(CheckpointStorage checkpointStorage) throws Exception {
    JobVertexID vertexId = new JobVertexID();
    ExecutionGraph executionGraph =
            new CheckpointCoordinatorTestingUtils.CheckpointExecutionGraphBuilder()
                    .addJobVertex(vertexId)
                    .setTransitToRunning(false)
                    .build();
    ExecutionVertex taskVertex = executionGraph.getJobVertex(vertexId).getTaskVertices()[0];

    CheckpointCoordinatorConfiguration coordinatorConfig =
            new CheckpointCoordinatorConfiguration.CheckpointCoordinatorConfigurationBuilder()
                    .setCheckpointInterval(10) // periodic interval is 10 ms
                    .setCheckpointTimeout(200000) // timeout is very long (200 s)
                    .setMinPauseBetweenCheckpoints(0) // no extra delay
                    .setMaxConcurrentCheckpoints(2) // max two concurrent checkpoints
                    .build();

    // the ID counter is created first and told about its owning coordinator afterwards
    CheckpointIDCounterWithOwner idCounter = new CheckpointIDCounterWithOwner();
    CheckpointCoordinator coordinator =
            new CheckpointCoordinatorBuilder()
                    .setExecutionGraph(executionGraph)
                    .setCheckpointCoordinatorConfiguration(coordinatorConfig)
                    .setCompletedCheckpointStore(new StandaloneCompletedCheckpointStore(2))
                    .setCheckpointStorage(checkpointStorage)
                    .setTimer(manuallyTriggeredScheduledExecutor)
                    .setCheckpointIDCounter(idCounter)
                    .build();
    idCounter.setOwner(coordinator);

    coordinator.startCheckpointScheduler();
    manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
    manuallyTriggeredScheduledExecutor.triggerAll();

    // the task is not running yet, so nothing may have been triggered
    assertEquals(0, coordinator.getNumberOfPendingCheckpoints());

    // switch the task to RUNNING and give the scheduler another chance to trigger
    taskVertex.getCurrentExecutionAttempt().transitionState(ExecutionState.RUNNING);
    manuallyTriggeredScheduledExecutor.triggerPeriodicScheduledTasks();
    manuallyTriggeredScheduledExecutor.triggerAll();

    return coordinator;
}
Also used : JobVertexID(org.apache.flink.runtime.jobgraph.JobVertexID) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) ExecutionVertex(org.apache.flink.runtime.executiongraph.ExecutionVertex) CheckpointCoordinatorBuilder(org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder)

Aggregations

CheckpointCoordinatorConfiguration (org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration): 25; JobCheckpointingSettings (org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings): 13; ExecutionGraph (org.apache.flink.runtime.executiongraph.ExecutionGraph): 10; JobVertexID (org.apache.flink.runtime.jobgraph.JobVertexID): 10; Test (org.junit.Test): 10; CheckpointCoordinatorBuilder (org.apache.flink.runtime.checkpoint.CheckpointCoordinatorTestingUtils.CheckpointCoordinatorBuilder): 9; JobVertex (org.apache.flink.runtime.jobgraph.JobVertex): 8; ExecutionVertex (org.apache.flink.runtime.executiongraph.ExecutionVertex): 7; ExecutionAttemptID (org.apache.flink.runtime.executiongraph.ExecutionAttemptID): 6; JobGraph (org.apache.flink.runtime.jobgraph.JobGraph): 6; AcknowledgeCheckpoint (org.apache.flink.runtime.messages.checkpoint.AcknowledgeCheckpoint): 6; DeclineCheckpoint (org.apache.flink.runtime.messages.checkpoint.DeclineCheckpoint): 4; IOException (java.io.IOException): 3; JobID (org.apache.flink.api.common.JobID): 3; ArrayList (java.util.ArrayList): 2; CountDownLatch (java.util.concurrent.CountDownLatch): 2; ExecutionException (java.util.concurrent.ExecutionException): 2; ExecutionConfig (org.apache.flink.api.common.ExecutionConfig): 2; Time (org.apache.flink.api.common.time.Time): 2; MiniClusterClient (org.apache.flink.client.program.MiniClusterClient): 2