Search in sources :

Example 1 with MasterTriggerRestoreHook

use of org.apache.flink.runtime.checkpoint.MasterTriggerRestoreHook in project flink by apache.

the class DefaultExecutionGraphBuilder method buildGraph.

/**
 * Builds a {@link DefaultExecutionGraph} from the given {@link JobGraph}.
 *
 * <p>The method performs the following steps in order:
 *
 * <ol>
 *   <li>creates the execution graph from the job's id, name and configuration;
 *   <li>attaches a JSON plan, falling back to an empty plan ({@code "{}"}) if generation fails;
 *   <li>calls {@code initializeOnMaster} on every job vertex (a {@code JobSubmissionException}
 *       is thrown for a vertex without an invokable class name);
 *   <li>attaches the topologically sorted vertices to the execution graph;
 *   <li>unless {@code isDynamicGraph} is set, and only if checkpointing is enabled for the job,
 *       loads the state backend, the checkpoint storage and the user-defined master hooks and
 *       enables checkpointing on the graph.
 * </ol>
 *
 * @param jobGraph the job graph to translate; must not be null
 * @param jobManagerConfig configuration read for the attempt history size, the partition release
 *     strategy, the state backend and the checkpoint storage
 * @param classLoader class loader used to initialize vertices and to deserialize the
 *     application-defined state backend, checkpoint storage and hook factories
 * @param log logger used for progress and warning messages
 * @param isDynamicGraph if true, checkpointing setup is skipped
 * @return the newly created execution graph
 * @throws JobExecutionException if a vertex cannot be initialized, or if the state backend,
 *     checkpoint storage or checkpoint hooks cannot be deserialized or instantiated
 * @throws JobException if constructing the execution graph fails with an {@code IOException}
 */
public static DefaultExecutionGraph buildGraph(JobGraph jobGraph, Configuration jobManagerConfig, ScheduledExecutorService futureExecutor, Executor ioExecutor, ClassLoader classLoader, CompletedCheckpointStore completedCheckpointStore, CheckpointsCleaner checkpointsCleaner, CheckpointIDCounter checkpointIdCounter, Time rpcTimeout, BlobWriter blobWriter, Logger log, ShuffleMaster<?> shuffleMaster, JobMasterPartitionTracker partitionTracker, TaskDeploymentDescriptorFactory.PartitionLocationConstraint partitionLocationConstraint, ExecutionDeploymentListener executionDeploymentListener, ExecutionStateUpdateListener executionStateUpdateListener, long initializationTimestamp, VertexAttemptNumberStore vertexAttemptNumberStore, VertexParallelismStore vertexParallelismStore, Supplier<CheckpointStatsTracker> checkpointStatsTrackerFactory, boolean isDynamicGraph) throws JobExecutionException, JobException {
    checkNotNull(jobGraph, "job graph cannot be null");
    final String jobName = jobGraph.getName();
    final JobID jobId = jobGraph.getJobID();
    final JobInformation jobInformation = new JobInformation(jobId, jobName, jobGraph.getSerializedExecutionConfig(), jobGraph.getJobConfiguration(), jobGraph.getUserJarBlobKeys(), jobGraph.getClasspaths());
    final int maxPriorAttemptsHistoryLength = jobManagerConfig.getInteger(JobManagerOptions.MAX_ATTEMPTS_HISTORY_SIZE);
    final PartitionGroupReleaseStrategy.Factory partitionGroupReleaseStrategyFactory = PartitionGroupReleaseStrategyFactoryLoader.loadPartitionGroupReleaseStrategyFactory(jobManagerConfig);
    // create the execution graph for this job
    final DefaultExecutionGraph executionGraph;
    try {
        executionGraph = new DefaultExecutionGraph(jobInformation, futureExecutor, ioExecutor, rpcTimeout, maxPriorAttemptsHistoryLength, classLoader, blobWriter, partitionGroupReleaseStrategyFactory, shuffleMaster, partitionTracker, partitionLocationConstraint, executionDeploymentListener, executionStateUpdateListener, initializationTimestamp, vertexAttemptNumberStore, vertexParallelismStore, isDynamicGraph);
    } catch (IOException e) {
        throw new JobException("Could not create the ExecutionGraph.", e);
    }
    // the JSON plan is best-effort: a failure here is logged and must not fail the job
    try {
        executionGraph.setJsonPlan(JsonPlanGenerator.generatePlan(jobGraph));
    } catch (Throwable t) {
        log.warn("Cannot create JSON plan for job", t);
        // give the graph an empty plan
        executionGraph.setJsonPlan("{}");
    }
    // initialize the vertices that have a master initialization hook
    // file output formats create directories here, input formats create splits
    final long initMasterStart = System.nanoTime();
    log.info("Running initialization on master for job {} ({}).", jobName, jobId);
    for (JobVertex vertex : jobGraph.getVertices()) {
        // a vertex without an invokable class cannot be executed, so reject the job early
        String executableClass = vertex.getInvokableClassName();
        if (executableClass == null || executableClass.isEmpty()) {
            throw new JobSubmissionException(jobId, "The vertex " + vertex.getID() + " (" + vertex.getName() + ") has no invokable class.");
        }
        try {
            vertex.initializeOnMaster(classLoader);
        } catch (Throwable t) {
            // any failure (including Errors) is reported as a job execution failure naming the vertex
            throw new JobExecutionException(jobId, "Cannot initialize task '" + vertex.getName() + "': " + t.getMessage(), t);
        }
    }
    // elapsed time is measured in nanoseconds and reported in milliseconds
    log.info("Successfully ran initialization on master in {} ms.", (System.nanoTime() - initMasterStart) / 1_000_000);
    // topologically sort the job vertices and attach the graph to the existing one
    List<JobVertex> sortedTopology = jobGraph.getVerticesSortedTopologicallyFromSources();
    if (log.isDebugEnabled()) {
        log.debug("Adding {} vertices from job graph {} ({}).", sortedTopology.size(), jobName, jobId);
    }
    executionGraph.attachJobGraph(sortedTopology);
    if (log.isDebugEnabled()) {
        log.debug("Successfully created execution graph from job graph {} ({}).", jobName, jobId);
    }
    // configure the state checkpointing
    if (isDynamicGraph) {
        // dynamic graph does not support checkpointing so we skip it
        log.warn("Skip setting up checkpointing for a job with dynamic graph.");
    } else if (isCheckpointingEnabled(jobGraph)) {
        JobCheckpointingSettings snapshotSettings = jobGraph.getCheckpointingSettings();
        // load the state backend from the application settings
        // (null means the application did not define one; the loader below then decides)
        final StateBackend applicationConfiguredBackend;
        final SerializedValue<StateBackend> serializedAppConfigured = snapshotSettings.getDefaultStateBackend();
        if (serializedAppConfigured == null) {
            applicationConfiguredBackend = null;
        } else {
            try {
                applicationConfiguredBackend = serializedAppConfigured.deserializeValue(classLoader);
            } catch (IOException | ClassNotFoundException e) {
                throw new JobExecutionException(jobId, "Could not deserialize application-defined state backend.", e);
            }
        }
        // resolve the effective state backend from the application value and the job manager config
        final StateBackend rootBackend;
        try {
            rootBackend = StateBackendLoader.fromApplicationOrConfigOrDefault(applicationConfiguredBackend, snapshotSettings.isChangelogStateBackendEnabled(), jobManagerConfig, classLoader, log);
        } catch (IllegalConfigurationException | IOException | DynamicCodeLoadingException e) {
            throw new JobExecutionException(jobId, "Could not instantiate configured state backend", e);
        }
        // load the checkpoint storage from the application settings
        // (null means the application did not define one)
        final CheckpointStorage applicationConfiguredStorage;
        final SerializedValue<CheckpointStorage> serializedAppConfiguredStorage = snapshotSettings.getDefaultCheckpointStorage();
        if (serializedAppConfiguredStorage == null) {
            applicationConfiguredStorage = null;
        } else {
            try {
                applicationConfiguredStorage = serializedAppConfiguredStorage.deserializeValue(classLoader);
            } catch (IOException | ClassNotFoundException e) {
                throw new JobExecutionException(jobId, "Could not deserialize application-defined checkpoint storage.", e);
            }
        }
        // resolve the effective checkpoint storage; the already resolved state backend is passed along
        final CheckpointStorage rootStorage;
        try {
            rootStorage = CheckpointStorageLoader.load(applicationConfiguredStorage, null, rootBackend, jobManagerConfig, classLoader, log);
        } catch (IllegalConfigurationException | DynamicCodeLoadingException e) {
            throw new JobExecutionException(jobId, "Could not instantiate configured checkpoint storage", e);
        }
        // instantiate the user-defined checkpoint hooks
        final SerializedValue<MasterTriggerRestoreHook.Factory[]> serializedHooks = snapshotSettings.getMasterHooks();
        final List<MasterTriggerRestoreHook<?>> hooks;
        if (serializedHooks == null) {
            hooks = Collections.emptyList();
        } else {
            final MasterTriggerRestoreHook.Factory[] hookFactories;
            try {
                hookFactories = serializedHooks.deserializeValue(classLoader);
            } catch (IOException | ClassNotFoundException e) {
                throw new JobExecutionException(jobId, "Could not instantiate user-defined checkpoint hooks", e);
            }
            // run the hook factories with the given class loader as the thread's context class
            // loader, and always restore the previous context class loader afterwards
            final Thread thread = Thread.currentThread();
            final ClassLoader originalClassLoader = thread.getContextClassLoader();
            thread.setContextClassLoader(classLoader);
            try {
                hooks = new ArrayList<>(hookFactories.length);
                for (MasterTriggerRestoreHook.Factory factory : hookFactories) {
                    hooks.add(MasterHooks.wrapHook(factory.create(), classLoader));
                }
            } finally {
                thread.setContextClassLoader(originalClassLoader);
            }
        }
        // hand everything resolved above to the execution graph to switch checkpointing on
        final CheckpointCoordinatorConfiguration chkConfig = snapshotSettings.getCheckpointCoordinatorConfiguration();
        executionGraph.enableCheckpointing(chkConfig, hooks, checkpointIdCounter, completedCheckpointStore, rootBackend, rootStorage, checkpointStatsTrackerFactory.get(), checkpointsCleaner);
    }
    return executionGraph;
}
Also used : ArrayList(java.util.ArrayList) JobCheckpointingSettings(org.apache.flink.runtime.jobgraph.tasks.JobCheckpointingSettings) TaskDeploymentDescriptorFactory(org.apache.flink.runtime.deployment.TaskDeploymentDescriptorFactory) JobSubmissionException(org.apache.flink.runtime.client.JobSubmissionException) StateBackend(org.apache.flink.runtime.state.StateBackend) MasterTriggerRestoreHook(org.apache.flink.runtime.checkpoint.MasterTriggerRestoreHook) JobException(org.apache.flink.runtime.JobException) JobExecutionException(org.apache.flink.runtime.client.JobExecutionException) CheckpointStorage(org.apache.flink.runtime.state.CheckpointStorage) ArrayList(java.util.ArrayList) List(java.util.List) CheckpointCoordinatorConfiguration(org.apache.flink.runtime.jobgraph.tasks.CheckpointCoordinatorConfiguration) IOException(java.io.IOException) SerializedValue(org.apache.flink.util.SerializedValue) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) JobID(org.apache.flink.api.common.JobID) PartitionGroupReleaseStrategy(org.apache.flink.runtime.executiongraph.failover.flip1.partitionrelease.PartitionGroupReleaseStrategy)

Example 2 with MasterTriggerRestoreHook

use of org.apache.flink.runtime.checkpoint.MasterTriggerRestoreHook in project flink by apache.

the class MasterHooks method deserializeState.

/**
 * Deserializes the given master state with the serializer supplied by the given hook.
 *
 * @param state the serialized state; its version and bytes are handed to the serializer
 * @param hook the hook whose checkpoint data serializer is used
 * @param <T> the type of state the hook works with
 * @return the deserialized state
 * @throws FlinkException if the hook supplies no serializer or anything fails while
 *     deserializing; the original failure is attached as the cause
 */
private static <T> T deserializeState(MasterState state, MasterTriggerRestoreHook<?> hook) throws FlinkException {
    @SuppressWarnings("unchecked") final MasterTriggerRestoreHook<T> castHook = (MasterTriggerRestoreHook<T>) hook;
    final String hookName = hook.getIdentifier();
    try {
        final SimpleVersionedSerializer<T> stateSerializer = castHook.createCheckpointDataSerializer();
        if (stateSerializer != null) {
            return stateSerializer.deserialize(state.version(), state.bytes());
        }
        // raised inside the try on purpose: it is wrapped by the catch-all below like any other failure
        throw new FlinkException("null serializer for state of hook " + hook.getIdentifier());
    } catch (Throwable cause) {
        final String message = "Cannot deserialize state for master hook '" + hookName + '\'';
        throw new FlinkException(message, cause);
    }
}
Also used : MasterTriggerRestoreHook(org.apache.flink.runtime.checkpoint.MasterTriggerRestoreHook) FlinkException(org.apache.flink.util.FlinkException)

Example 3 with MasterTriggerRestoreHook

use of org.apache.flink.runtime.checkpoint.MasterTriggerRestoreHook in project flink by apache.

the class MasterHooks method triggerHook.

// ------------------------------------------------------------------------
// checkpoint triggering
// ------------------------------------------------------------------------
/**
 * Trigger master hook and return a completable future with state.
 *
 * <p>If the hook returns a {@code null} future, an already completed future holding {@code null}
 * is returned. If the hook's future completes with {@code null}, the returned future also
 * completes with {@code null}. Otherwise the result is serialized with the hook's serializer and
 * wrapped into a {@link MasterState}.
 *
 * <p>Any failure inside the returned future is reported as a {@link CompletionException} whose
 * cause is a {@link FlinkException} naming the hook. A failure thrown synchronously by {@code
 * triggerCheckpoint} yields an exceptionally completed future instead.
 *
 * @param hook The master hook given
 * @param checkpointId The checkpoint ID of the triggering checkpoint
 * @param timestamp The (informational) timestamp for the triggering checkpoint
 * @param executor An executor that can be used for asynchronous I/O calls
 * @param <T> The type of data produced by the hook
 * @return the completable future with state
 */
public static <T> CompletableFuture<MasterState> triggerHook(MasterTriggerRestoreHook<T> hook, long checkpointId, long timestamp, Executor executor) {
    final String id = hook.getIdentifier();
    final SimpleVersionedSerializer<T> serializer = hook.createCheckpointDataSerializer();
    try {
        // call the hook!
        final CompletableFuture<T> resultFuture = hook.triggerCheckpoint(checkpointId, timestamp, executor);
        if (resultFuture == null) {
            // the hook has nothing to do asynchronously and produces no state
            return CompletableFuture.completedFuture(null);
        }
        return resultFuture.thenApply(result -> {
            // if the result of the future is not null, return it as state
            if (result == null) {
                return null;
            } else if (serializer != null) {
                try {
                    final int version = serializer.getVersion();
                    final byte[] bytes = serializer.serialize(result);
                    return new MasterState(id, bytes, version);
                } catch (Throwable t) {
                    // fatal errors and OOMs must not be hidden behind a FlinkException
                    ExceptionUtils.rethrowIfFatalErrorOrOOM(t);
                    throw new CompletionException(new FlinkException("Failed to serialize state of master hook '" + id + '\'', t));
                }
            } else {
                // a non-null result cannot be stored without a serializer
                throw new CompletionException(new FlinkException("Checkpoint hook '" + id + "' is stateful but creates no serializer"));
            }
        }).exceptionally((throwable) -> {
            // re-wrap every failure of the chain so that the message names the hook
            throw new CompletionException(new FlinkException("Checkpoint master hook '" + id + "' produced an exception", throwable.getCause()));
        });
    } catch (Throwable t) {
        return FutureUtils.completedExceptionally(new FlinkException("Error while triggering checkpoint master hook '" + id + '\'', t));
    }
}
Also used : MasterState(org.apache.flink.runtime.checkpoint.MasterState) FlinkException(org.apache.flink.util.FlinkException) Logger(org.slf4j.Logger) Tuple2(org.apache.flink.api.java.tuple.Tuple2) Executor(java.util.concurrent.Executor) Collection(java.util.Collection) ExceptionUtils(org.apache.flink.util.ExceptionUtils) CompletableFuture(java.util.concurrent.CompletableFuture) CompletionException(java.util.concurrent.CompletionException) Preconditions(org.apache.flink.util.Preconditions) ArrayList(java.util.ArrayList) MasterTriggerRestoreHook(org.apache.flink.runtime.checkpoint.MasterTriggerRestoreHook) LinkedHashMap(java.util.LinkedHashMap) LambdaUtil(org.apache.flink.util.LambdaUtil) FutureUtils(org.apache.flink.util.concurrent.FutureUtils) SimpleVersionedSerializer(org.apache.flink.core.io.SimpleVersionedSerializer) Map(java.util.Map) Nullable(javax.annotation.Nullable) MasterState(org.apache.flink.runtime.checkpoint.MasterState) CompletionException(java.util.concurrent.CompletionException) FlinkException(org.apache.flink.util.FlinkException)

Example 4 with MasterTriggerRestoreHook

use of org.apache.flink.runtime.checkpoint.MasterTriggerRestoreHook in project flink by apache.

the class MasterHooks method restoreHook.

/**
 * Calls {@code restoreCheckpoint} on a single hook with the given (possibly null) state.
 *
 * @param state the state object handed to the hook, cast unchecked to the hook's state type
 * @param hook the hook to restore
 * @param checkpointId the ID of the checkpoint being restored
 * @param <T> the state type the hook is treated as accepting
 * @throws FlinkException passed through unchanged if thrown by the hook; any other non-fatal
 *     failure is wrapped in a new {@code FlinkException} naming the hook
 */
private static <T> void restoreHook(final Object state, final MasterTriggerRestoreHook<?> hook, final long checkpointId) throws FlinkException {
    @SuppressWarnings("unchecked") final MasterTriggerRestoreHook<T> target = (MasterTriggerRestoreHook<T>) hook;
    @SuppressWarnings("unchecked") final T payload = (T) state;
    try {
        target.restoreCheckpoint(checkpointId, payload);
    } catch (Throwable failure) {
        // a FlinkException from the hook is propagated as-is, without extra wrapping
        if (failure instanceof FlinkException) {
            throw (FlinkException) failure;
        }
        // everything else is caught here, including Errors that may come from dependency and
        // classpath issues; fatal errors are rethrown untouched
        ExceptionUtils.rethrowIfFatalError(failure);
        throw new FlinkException("Error while calling restoreCheckpoint on checkpoint hook '" + hook.getIdentifier() + '\'', failure);
    }
}
Also used : MasterTriggerRestoreHook(org.apache.flink.runtime.checkpoint.MasterTriggerRestoreHook) FlinkException(org.apache.flink.util.FlinkException)

Example 5 with MasterTriggerRestoreHook

use of org.apache.flink.runtime.checkpoint.MasterTriggerRestoreHook in project flink by apache.

the class MasterHooks method restoreMasterHooks.

// ------------------------------------------------------------------------
// checkpoint restoring
// ------------------------------------------------------------------------
/**
 * Calls the restore method given checkpoint master hooks and passes the given master state to
 * them where state with a matching name is found.
 *
 * <p>If state is found and no hook with the same name is found, the method throws an exception,
 * unless the {@code allowUnmatchedState} flag is set, in which case the state is dropped.
 *
 * <p>All matching state is deserialized first; only then are the hooks called. Hooks for which
 * no state was found are called afterwards with {@code null} state. If either {@code states} or
 * {@code masterHooks} is null or empty, the method returns without calling any hook.
 *
 * @param masterHooks The hooks to call restore on
 * @param states The state to pass to the hooks; null entries are skipped
 * @param checkpointId The checkpoint ID of the restored checkpoint
 * @param allowUnmatchedState If false, the method fails when a state is not picked up by any
 *     hook; if true, such state is logged and dropped.
 * @param log The logger for log messages
 * @throws FlinkException Thrown, if the hooks throw an exception, or the state deserialization
 *     fails.
 * @throws IllegalStateException Thrown, if a state has no matching hook and {@code
 *     allowUnmatchedState} is false.
 */
public static void restoreMasterHooks(final Map<String, MasterTriggerRestoreHook<?>> masterHooks, final Collection<MasterState> states, final long checkpointId, final boolean allowUnmatchedState, final Logger log) throws FlinkException {
    // early out
    if (states == null || states.isEmpty() || masterHooks == null || masterHooks.isEmpty()) {
        log.info("No master state to restore");
        return;
    }
    log.info("Calling master restore hooks");
    // collect the hooks in a private copy: entries are removed as states are matched, so
    // whatever remains afterwards are the hooks without checkpointed state
    final Map<String, MasterTriggerRestoreHook<?>> allHooks = new LinkedHashMap<>(masterHooks);
    // first, deserialize all hook state
    final ArrayList<Tuple2<MasterTriggerRestoreHook<?>, Object>> hooksAndStates = new ArrayList<>();
    for (MasterState state : states) {
        if (state != null) {
            final String name = state.name();
            final MasterTriggerRestoreHook<?> hook = allHooks.remove(name);
            if (hook != null) {
                log.debug("Found state to restore for hook '{}'", name);
                Object deserializedState = deserializeState(state, hook);
                hooksAndStates.add(new Tuple2<>(hook, deserializedState));
            } else if (!allowUnmatchedState) {
                throw new IllegalStateException("Found state '" + name + "' which is not resumed by any hook.");
            } else {
                log.info("Dropping unmatched state from '{}'", name);
            }
        }
    }
    // now that all is deserialized, call the hooks
    for (Tuple2<MasterTriggerRestoreHook<?>, Object> hookAndState : hooksAndStates) {
        restoreHook(hookAndState.f1, hookAndState.f0, checkpointId);
    }
    // trigger the remaining hooks without checkpointed state
    for (MasterTriggerRestoreHook<?> hook : allHooks.values()) {
        restoreHook(null, hook, checkpointId);
    }
}
Also used : MasterState(org.apache.flink.runtime.checkpoint.MasterState) ArrayList(java.util.ArrayList) MasterTriggerRestoreHook(org.apache.flink.runtime.checkpoint.MasterTriggerRestoreHook) LinkedHashMap(java.util.LinkedHashMap) Tuple2(org.apache.flink.api.java.tuple.Tuple2)

Aggregations

MasterTriggerRestoreHook (org.apache.flink.runtime.checkpoint.MasterTriggerRestoreHook)6 ArrayList (java.util.ArrayList)3 FlinkException (org.apache.flink.util.FlinkException)3 LinkedHashMap (java.util.LinkedHashMap)2 Tuple2 (org.apache.flink.api.java.tuple.Tuple2)2 MasterState (org.apache.flink.runtime.checkpoint.MasterState)2 IOException (java.io.IOException)1 Collection (java.util.Collection)1 HashSet (java.util.HashSet)1 List (java.util.List)1 Map (java.util.Map)1 CompletableFuture (java.util.concurrent.CompletableFuture)1 CompletionException (java.util.concurrent.CompletionException)1 Executor (java.util.concurrent.Executor)1 Nullable (javax.annotation.Nullable)1 JobID (org.apache.flink.api.common.JobID)1 SimpleVersionedSerializer (org.apache.flink.core.io.SimpleVersionedSerializer)1 JobException (org.apache.flink.runtime.JobException)1 Factory (org.apache.flink.runtime.checkpoint.MasterTriggerRestoreHook.Factory)1 JobExecutionException (org.apache.flink.runtime.client.JobExecutionException)1