
Example 1 with JobCheckpointingSettings

Use of org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings in project flink by apache.

From the class DispatcherCleanupITCase, method createJobGraph.

private JobGraph createJobGraph() {
    final JobVertex firstVertex = new JobVertex("first");
    firstVertex.setInvokableClass(NoOpInvokable.class);
    firstVertex.setParallelism(1);
    final JobVertex secondVertex = new JobVertex("second");
    secondVertex.setInvokableClass(NoOpInvokable.class);
    secondVertex.setParallelism(1);
    final CheckpointCoordinatorConfiguration checkpointCoordinatorConfiguration =
            CheckpointCoordinatorConfiguration.builder()
                    .setCheckpointInterval(20L)
                    .setMinPauseBetweenCheckpoints(20L)
                    .setCheckpointTimeout(10_000L)
                    .build();
    final JobCheckpointingSettings checkpointingSettings =
            new JobCheckpointingSettings(checkpointCoordinatorConfiguration, null);
    return JobGraphBuilder.newStreamingJobGraphBuilder()
            .addJobVertex(firstVertex)
            .addJobVertex(secondVertex)
            .setJobCheckpointingSettings(checkpointingSettings)
            .build();
}
Also used: JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) JobCheckpointingSettings(org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration)
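As a small usage sketch (not part of the Flink test above), the values configured through the builder can be read back from the resulting graph. The getters below are assumed to mirror the setters used in createJobGraph; getCheckpointingSettings and getCheckpointCoordinatorConfiguration already appear in the other examples on this page.

// Hypothetical check, assuming getters that mirror the builder's setters.
JobGraph jobGraph = createJobGraph();
CheckpointCoordinatorConfiguration chkConfig =
        jobGraph.getCheckpointingSettings().getCheckpointCoordinatorConfiguration();
assert chkConfig.getCheckpointInterval() == 20L;
assert chkConfig.getMinPauseBetweenCheckpoints() == 20L;
assert chkConfig.getCheckpointTimeout() == 10_000L;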

Example 2 with JobCheckpointingSettings

Use of org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings in project flink by apache.

From the class DefaultExecutionGraphBuilder, method buildGraph.

public static DefaultExecutionGraph buildGraph(
        JobGraph jobGraph,
        Configuration jobManagerConfig,
        ScheduledExecutorService futureExecutor,
        Executor ioExecutor,
        ClassLoader classLoader,
        CompletedCheckpointStore completedCheckpointStore,
        CheckpointsCleaner checkpointsCleaner,
        CheckpointIDCounter checkpointIdCounter,
        Time rpcTimeout,
        BlobWriter blobWriter,
        Logger log,
        ShuffleMaster<?> shuffleMaster,
        JobMasterPartitionTracker partitionTracker,
        TaskDeploymentDescriptorFactory.PartitionLocationConstraint partitionLocationConstraint,
        ExecutionDeploymentListener executionDeploymentListener,
        ExecutionStateUpdateListener executionStateUpdateListener,
        long initializationTimestamp,
        VertexAttemptNumberStore vertexAttemptNumberStore,
        VertexParallelismStore vertexParallelismStore,
        Supplier<CheckpointStatsTracker> checkpointStatsTrackerFactory,
        boolean isDynamicGraph)
        throws JobExecutionException, JobException {
    checkNotNull(jobGraph, "job graph cannot be null");
    final String jobName = jobGraph.getName();
    final JobID jobId = jobGraph.getJobID();
    final JobInformation jobInformation = new JobInformation(jobId, jobName, jobGraph.getSerializedExecutionConfig(), jobGraph.getJobConfiguration(), jobGraph.getUserJarBlobKeys(), jobGraph.getClasspaths());
    final int maxPriorAttemptsHistoryLength = jobManagerConfig.getInteger(JobManagerOptions.MAX_ATTEMPTS_HISTORY_SIZE);
    final PartitionGroupReleaseStrategy.Factory partitionGroupReleaseStrategyFactory = PartitionGroupReleaseStrategyFactoryLoader.loadPartitionGroupReleaseStrategyFactory(jobManagerConfig);
    // create a new execution graph, if none exists so far
    final DefaultExecutionGraph executionGraph;
    try {
        executionGraph =
                new DefaultExecutionGraph(
                        jobInformation,
                        futureExecutor,
                        ioExecutor,
                        rpcTimeout,
                        maxPriorAttemptsHistoryLength,
                        classLoader,
                        blobWriter,
                        partitionGroupReleaseStrategyFactory,
                        shuffleMaster,
                        partitionTracker,
                        partitionLocationConstraint,
                        executionDeploymentListener,
                        executionStateUpdateListener,
                        initializationTimestamp,
                        vertexAttemptNumberStore,
                        vertexParallelismStore,
                        isDynamicGraph);
    } catch (IOException e) {
        throw new JobException("Could not create the ExecutionGraph.", e);
    }
    try {
        executionGraph.setJsonPlan(JsonPlanGenerator.generatePlan(jobGraph));
    } catch (Throwable t) {
        log.warn("Cannot create JSON plan for job", t);
        // give the graph an empty plan
        executionGraph.setJsonPlan("{}");
    }
    // initialize the vertices that have a master initialization hook
    // file output formats create directories here, input formats create splits
    final long initMasterStart = System.nanoTime();
    log.info("Running initialization on master for job {} ({}).", jobName, jobId);
    for (JobVertex vertex : jobGraph.getVertices()) {
        String executableClass = vertex.getInvokableClassName();
        if (executableClass == null || executableClass.isEmpty()) {
            throw new JobSubmissionException(jobId, "The vertex " + vertex.getID() + " (" + vertex.getName() + ") has no invokable class.");
        }
        try {
            vertex.initializeOnMaster(classLoader);
        } catch (Throwable t) {
            throw new JobExecutionException(jobId, "Cannot initialize task '" + vertex.getName() + "': " + t.getMessage(), t);
        }
    }
    log.info("Successfully ran initialization on master in {} ms.", (System.nanoTime() - initMasterStart) / 1_000_000);
    // topologically sort the job vertices and attach the graph to the existing one
    List<JobVertex> sortedTopology = jobGraph.getVerticesSortedTopologicallyFromSources();
    if (log.isDebugEnabled()) {
        log.debug("Adding {} vertices from job graph {} ({}).", sortedTopology.size(), jobName, jobId);
    }
    executionGraph.attachJobGraph(sortedTopology);
    if (log.isDebugEnabled()) {
        log.debug("Successfully created execution graph from job graph {} ({}).", jobName, jobId);
    }
    // configure the state checkpointing
    if (isDynamicGraph) {
        // dynamic graph does not support checkpointing so we skip it
        log.warn("Skip setting up checkpointing for a job with dynamic graph.");
    } else if (isCheckpointingEnabled(jobGraph)) {
        JobCheckpointingSettings snapshotSettings = jobGraph.getCheckpointingSettings();
        // load the state backend from the application settings
        final StateBackend applicationConfiguredBackend;
        final SerializedValue<StateBackend> serializedAppConfigured = snapshotSettings.getDefaultStateBackend();
        if (serializedAppConfigured == null) {
            applicationConfiguredBackend = null;
        } else {
            try {
                applicationConfiguredBackend = serializedAppConfigured.deserializeValue(classLoader);
            } catch (IOException | ClassNotFoundException e) {
                throw new JobExecutionException(jobId, "Could not deserialize application-defined state backend.", e);
            }
        }
        final StateBackend rootBackend;
        try {
            rootBackend = StateBackendLoader.fromApplicationOrConfigOrDefault(applicationConfiguredBackend, snapshotSettings.isChangelogStateBackendEnabled(), jobManagerConfig, classLoader, log);
        } catch (IllegalConfigurationException | IOException | DynamicCodeLoadingException e) {
            throw new JobExecutionException(jobId, "Could not instantiate configured state backend", e);
        }
        // load the checkpoint storage from the application settings
        final CheckpointStorage applicationConfiguredStorage;
        final SerializedValue<CheckpointStorage> serializedAppConfiguredStorage = snapshotSettings.getDefaultCheckpointStorage();
        if (serializedAppConfiguredStorage == null) {
            applicationConfiguredStorage = null;
        } else {
            try {
                applicationConfiguredStorage = serializedAppConfiguredStorage.deserializeValue(classLoader);
            } catch (IOException | ClassNotFoundException e) {
                throw new JobExecutionException(jobId, "Could not deserialize application-defined checkpoint storage.", e);
            }
        }
        final CheckpointStorage rootStorage;
        try {
            rootStorage = CheckpointStorageLoader.load(applicationConfiguredStorage, null, rootBackend, jobManagerConfig, classLoader, log);
        } catch (IllegalConfigurationException | DynamicCodeLoadingException e) {
            throw new JobExecutionException(jobId, "Could not instantiate configured checkpoint storage", e);
        }
        // instantiate the user-defined checkpoint hooks
        final SerializedValue<MasterTriggerRestoreHook.Factory[]> serializedHooks = snapshotSettings.getMasterHooks();
        final List<MasterTriggerRestoreHook<?>> hooks;
        if (serializedHooks == null) {
            hooks = Collections.emptyList();
        } else {
            final MasterTriggerRestoreHook.Factory[] hookFactories;
            try {
                hookFactories = serializedHooks.deserializeValue(classLoader);
            } catch (IOException | ClassNotFoundException e) {
                throw new JobExecutionException(jobId, "Could not instantiate user-defined checkpoint hooks", e);
            }
            final Thread thread = Thread.currentThread();
            final ClassLoader originalClassLoader = thread.getContextClassLoader();
            thread.setContextClassLoader(classLoader);
            try {
                hooks = new ArrayList<>(hookFactories.length);
                for (MasterTriggerRestoreHook.Factory factory : hookFactories) {
                    hooks.add(MasterHooks.wrapHook(factory.create(), classLoader));
                }
            } finally {
                thread.setContextClassLoader(originalClassLoader);
            }
        }
        final CheckpointCoordinatorConfiguration chkConfig = snapshotSettings.getCheckpointCoordinatorConfiguration();
        executionGraph.enableCheckpointing(chkConfig, hooks, checkpointIdCounter, completedCheckpointStore, rootBackend, rootStorage, checkpointStatsTrackerFactory.get(), checkpointsCleaner);
    }
    return executionGraph;
}
Also used: ArrayList(java.util.ArrayList) JobCheckpointingSettings(org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings) TaskDeploymentDescriptorFactory(org.apache.flink.runtime.deployment.TaskDeploymentDescriptorFactory) JobSubmissionException(org.apache.flink.runtime.client.JobSubmissionException) StateBackend(org.apache.flink.runtime.state.StateBackend) MasterTriggerRestoreHook(org.apache.flink.runtime.checkpoint.MasterTriggerRestoreHook) JobException(org.apache.flink.runtime.JobException) JobExecutionException(org.apache.flink.runtime.client.JobExecutionException) CheckpointStorage(org.apache.flink.runtime.state.CheckpointStorage) List(java.util.List) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) IOException(java.io.IOException) SerializedValue(org.apache.flink.util.SerializedValue) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) JobID(org.apache.flink.api.common.JobID) PartitionGroupReleaseStrategy(org.apache.flink.runtime.executiongraph.failover.flip1.partitionrelease.PartitionGroupReleaseStrategy)
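buildGraph rehydrates the application-configured StateBackend, CheckpointStorage and master hooks from SerializedValue wrappers using the user code class loader. The sketch below isolates that round trip with a plain String payload; SerializedValueRoundTrip is a hypothetical class name, and only the two SerializedValue calls already visible in the snippets on this page are used.

import org.apache.flink.util.SerializedValue;

// Minimal sketch of the SerializedValue round trip that buildGraph relies on.
public final class SerializedValueRoundTrip {

    public static void main(String[] args) throws Exception {
        // Any serializable payload stands in for a StateBackend or CheckpointStorage here.
        SerializedValue<String> serialized = new SerializedValue<>("application-configured value");

        // buildGraph passes the user code class loader so that user-defined classes resolve.
        ClassLoader userClassLoader = Thread.currentThread().getContextClassLoader();
        String restored = serialized.deserializeValue(userClassLoader);

        System.out.println(restored);
    }
}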

Example 3 with JobCheckpointingSettings

Use of org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings in project flink by apache.

From the class CheckpointSettingsSerializableTest, method testDeserializationOfUserCodeWithUserClassLoader.

@Test
public void testDeserializationOfUserCodeWithUserClassLoader() throws Exception {
    final ClassLoaderUtils.ObjectAndClassLoader<Serializable> outsideClassLoading = ClassLoaderUtils.createSerializableObjectFromNewClassLoader();
    final ClassLoader classLoader = outsideClassLoading.getClassLoader();
    final Serializable outOfClassPath = outsideClassLoading.getObject();
    final MasterTriggerRestoreHook.Factory[] hooks = { new TestFactory(outOfClassPath) };
    final SerializedValue<MasterTriggerRestoreHook.Factory[]> serHooks = new SerializedValue<>(hooks);
    final JobCheckpointingSettings checkpointingSettings =
            new JobCheckpointingSettings(
                    new CheckpointCoordinatorConfiguration(
                            1000L,
                            10000L,
                            0L,
                            1,
                            CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
                            true,
                            false,
                            0,
                            0),
                    new SerializedValue<StateBackend>(new CustomStateBackend(outOfClassPath)),
                    TernaryBoolean.UNDEFINED,
                    new SerializedValue<CheckpointStorage>(new CustomCheckpointStorage(outOfClassPath)),
                    serHooks);
    final JobGraph jobGraph =
            JobGraphBuilder.newStreamingJobGraphBuilder()
                    .setJobCheckpointingSettings(checkpointingSettings)
                    .build();
    // to serialize/deserialize the job graph to see if the behavior is correct under
    // distributed execution
    final JobGraph copy = CommonTestUtils.createCopySerializable(jobGraph);
    final ExecutionGraph eg = TestingDefaultExecutionGraphBuilder.newBuilder().setJobGraph(copy).setUserClassLoader(classLoader).build();
    assertEquals(1, eg.getCheckpointCoordinator().getNumberOfRegisteredMasterHooks());
    assertTrue(jobGraph.getCheckpointingSettings().getDefaultStateBackend().deserializeValue(classLoader) instanceof CustomStateBackend);
}
Also used: Serializable(java.io.Serializable) JobCheckpointingSettings(org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) SerializedValue(org.apache.flink.util.SerializedValue) StateBackend(org.apache.flink.runtime.state.StateBackend) OperatorStateBackend(org.apache.flink.runtime.state.OperatorStateBackend) AbstractKeyedStateBackend(org.apache.flink.runtime.state.AbstractKeyedStateBackend) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) ClassLoaderUtils(org.apache.flink.testutils.ClassLoaderUtils) ExecutionGraph(org.apache.flink.runtime.executiongraph.ExecutionGraph) CheckpointStorage(org.apache.flink.runtime.state.CheckpointStorage) Test(org.junit.Test)
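The comment in the test explains that the job graph is copied through Java serialization to mimic distributed execution. The helper below is a sketch of what such a copy boils down to, a plain JDK serialization round trip; it is an illustration of the idea, not the CommonTestUtils.createCopySerializable utility itself, and SerializationCopy is a hypothetical class name.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;

// Hypothetical helper: copies an object by serializing and deserializing it,
// which is how the test simulates shipping the JobGraph to another JVM.
public final class SerializationCopy {

    @SuppressWarnings("unchecked")
    static <T extends Serializable> T copy(T original) throws Exception {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
            out.writeObject(original);
        }
        try (ObjectInputStream in =
                new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
            return (T) in.readObject();
        }
    }
}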

Example 4 with JobCheckpointingSettings

Use of org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings in project flink by apache.

From the class DefaultExecutionGraphDeploymentTest, method createExecutionGraph.

private ExecutionGraph createExecutionGraph(Configuration configuration) throws Exception {
    final JobGraph jobGraph = JobGraphTestUtils.emptyJobGraph();
    jobGraph.setSnapshotSettings(
            new JobCheckpointingSettings(
                    new CheckpointCoordinatorConfiguration(
                            100,
                            10 * 60 * 1000,
                            0,
                            1,
                            CheckpointRetentionPolicy.NEVER_RETAIN_AFTER_TERMINATION,
                            false,
                            false,
                            0,
                            0),
                    null));
    return TestingDefaultExecutionGraphBuilder.newBuilder()
            .setJobGraph(jobGraph)
            .setJobMasterConfig(configuration)
            .setBlobWriter(blobWriter)
            .build();
}
Also used: JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobCheckpointingSettings(org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration)
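Example 4 passes the nine-argument CheckpointCoordinatorConfiguration constructor directly, while Examples 1 and 5 use the builder. The fragment below is a sketch of roughly the same settings expressed through the builder, assuming the first three constructor arguments are the interval, timeout and minimum pause (matching the builder calls in Example 1); fields not covered by these setters keep their builder defaults, so the result is not guaranteed to be byte-for-byte identical.

// Sketch: builder-based near-equivalent of the constructor call above (assumed mapping).
CheckpointCoordinatorConfiguration chkConfig =
        CheckpointCoordinatorConfiguration.builder()
                .setCheckpointInterval(100L)
                .setCheckpointTimeout(10 * 60 * 1000L)
                .setMinPauseBetweenCheckpoints(0L)
                .build();
jobGraph.setSnapshotSettings(new JobCheckpointingSettings(chkConfig, null));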

Example 5 with JobCheckpointingSettings

Use of org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings in project flink by apache.

From the class JobDispatcherITCase, method generateAndPersistJobGraph.

private JobID generateAndPersistJobGraph(Configuration configuration, long checkpointInterval, Path tmpPath) throws Exception {
    final JobVertex jobVertex = new JobVertex("jobVertex");
    jobVertex.setInvokableClass(AtLeastOneCheckpointInvokable.class);
    jobVertex.setParallelism(1);
    final CheckpointCoordinatorConfiguration checkpointCoordinatorConfiguration =
            CheckpointCoordinatorConfiguration.builder()
                    .setCheckpointInterval(checkpointInterval)
                    .build();
    final JobCheckpointingSettings checkpointingSettings =
            new JobCheckpointingSettings(checkpointCoordinatorConfiguration, null);
    final JobGraph jobGraph =
            JobGraphBuilder.newStreamingJobGraphBuilder()
                    .addJobVertex(jobVertex)
                    .setJobCheckpointingSettings(checkpointingSettings)
                    .build();
    final Path jobGraphPath = tmpPath.resolve(JOB_GRAPH_FILE_PATH.defaultValue());
    try (ObjectOutputStream objectOutputStream = new ObjectOutputStream(Files.newOutputStream(jobGraphPath, CREATE))) {
        objectOutputStream.writeObject(jobGraph);
    }
    configuration.setString(JOB_GRAPH_FILE_PATH.key(), jobGraphPath.toString());
    return jobGraph.getJobID();
}
Also used: Path(java.nio.file.Path) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) JobCheckpointingSettings(org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) ObjectOutputStream(java.io.ObjectOutputStream)
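The method above persists the JobGraph with an ObjectOutputStream and records its path in the configuration. A matching read side is sketched below; it only mirrors the write shown here (the dispatcher's actual recovery path is more involved) and assumes java.io.ObjectInputStream is imported and the enclosing method declares throws Exception.

// Sketch of reading back the persisted job graph written above.
final Path jobGraphPath = tmpPath.resolve(JOB_GRAPH_FILE_PATH.defaultValue());
try (ObjectInputStream objectInputStream =
        new ObjectInputStream(Files.newInputStream(jobGraphPath))) {
    final JobGraph recovered = (JobGraph) objectInputStream.readObject();
    System.out.println(recovered.getJobID());
}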

Aggregations

JobCheckpointingSettings (org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings) 20
CheckpointCoordinatorConfiguration (org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) 15
JobGraph (org.apache.flink.runtime.jobgraph.JobGraph) 11
JobVertex (org.apache.flink.runtime.jobgraph.JobVertex) 11
Test (org.junit.Test) 7
IOException (java.io.IOException) 5
JobID (org.apache.flink.api.common.JobID) 4
CheckpointStorage (org.apache.flink.runtime.state.CheckpointStorage) 4
StateBackend (org.apache.flink.runtime.state.StateBackend) 4
Duration (java.time.Duration) 3
ArrayList (java.util.ArrayList) 3
List (java.util.List) 3
CompletableFuture (java.util.concurrent.CompletableFuture) 3
ExecutionConfig (org.apache.flink.api.common.ExecutionConfig) 3
JobStatus (org.apache.flink.api.common.JobStatus) 3
Configuration (org.apache.flink.configuration.Configuration) 3
ArrayBlockingQueue (java.util.concurrent.ArrayBlockingQueue) 2
BlockingQueue (java.util.concurrent.BlockingQueue) 2
CountDownLatch (java.util.concurrent.CountDownLatch) 2
TimeUnit (java.util.concurrent.TimeUnit) 2