Use of org.apache.flink.runtime.jobgraph.tasks.JobSnapshottingSettings in project flink by apache.
In class StreamingJobGraphGenerator, the method configureCheckpointing:
private void configureCheckpointing() {
    CheckpointConfig cfg = streamGraph.getCheckpointConfig();
    long interval = cfg.getCheckpointInterval();
    if (interval > 0) {
        // check if a restart strategy has been set, if not then set the FixedDelayRestartStrategy
        if (streamGraph.getExecutionConfig().getRestartStrategy() == null) {
            // if the user enabled checkpointing, the default number of exec retries is infinite.
            streamGraph.getExecutionConfig().setRestartStrategy(
                RestartStrategies.fixedDelayRestart(Integer.MAX_VALUE, DEFAULT_RESTART_DELAY));
        }
    } else {
        // an interval of Long.MAX_VALUE means that periodic checkpointing is disabled
        interval = Long.MAX_VALUE;
    }

    // collect the vertices that receive "trigger checkpoint" messages.
    // currently, these are all the sources
    List<JobVertexID> triggerVertices = new ArrayList<>();

    // collect the vertices that need to acknowledge the checkpoint
    // currently, these are all vertices
    List<JobVertexID> ackVertices = new ArrayList<>(jobVertices.size());

    // collect the vertices that receive "commit checkpoint" messages
    // currently, these are all vertices
    List<JobVertexID> commitVertices = new ArrayList<>();

    for (JobVertex vertex : jobVertices.values()) {
        if (vertex.isInputVertex()) {
            triggerVertices.add(vertex.getID());
        }
        commitVertices.add(vertex.getID());
        ackVertices.add(vertex.getID());
    }

    ExternalizedCheckpointSettings externalizedCheckpointSettings;
    if (cfg.isExternalizedCheckpointsEnabled()) {
        CheckpointConfig.ExternalizedCheckpointCleanup cleanup = cfg.getExternalizedCheckpointCleanup();
        // Sanity check
        if (cleanup == null) {
            throw new IllegalStateException("Externalized checkpoints enabled, but no cleanup mode configured.");
        }
        externalizedCheckpointSettings = ExternalizedCheckpointSettings.externalizeCheckpoints(cleanup.deleteOnCancellation());
    } else {
        externalizedCheckpointSettings = ExternalizedCheckpointSettings.none();
    }

    CheckpointingMode mode = cfg.getCheckpointingMode();
    boolean isExactlyOnce;
    if (mode == CheckpointingMode.EXACTLY_ONCE) {
        isExactlyOnce = true;
    } else if (mode == CheckpointingMode.AT_LEAST_ONCE) {
        isExactlyOnce = false;
    } else {
        throw new IllegalStateException("Unexpected checkpointing mode. "
            + "Did not expect there to be another checkpointing mode besides "
            + "exactly-once or at-least-once.");
    }

    JobSnapshottingSettings settings = new JobSnapshottingSettings(
        triggerVertices,
        ackVertices,
        commitVertices,
        interval,
        cfg.getCheckpointTimeout(),
        cfg.getMinPauseBetweenCheckpoints(),
        cfg.getMaxConcurrentCheckpoints(),
        externalizedCheckpointSettings,
        streamGraph.getStateBackend(),
        isExactlyOnce);

    jobGraph.setSnapshotSettings(settings);
}
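For context, the CheckpointConfig this method reads is populated on the user side of the DataStream API. The following is a minimal sketch of a configuration that would exercise the exactly-once and externalized-checkpoint branches above, assuming the Flink 1.2-era API; the interval and timeout values are illustrative, not taken from the snippet.

import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

// enable periodic checkpoints every 10 seconds, so cfg.getCheckpointInterval() > 0
env.enableCheckpointing(10000L);

CheckpointConfig cfg = env.getCheckpointConfig();
// exactly-once is the default mode; set explicitly here for clarity
cfg.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
// checkpoints must complete within one minute (illustrative value)
cfg.setCheckpointTimeout(60000L);
// retain checkpoints on cancellation, so the externalized-checkpoint branch is taken
cfg.enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);

With checkpointing left disabled instead, the interval stays at its default and configureCheckpointing falls into the else branch that sets it to Long.MAX_VALUE, which is exactly what the next test verifies.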
Use of org.apache.flink.runtime.jobgraph.tasks.JobSnapshottingSettings in project flink by apache.
In class StreamingJobGraphGeneratorTest, the method testDisabledCheckpointing:
/**
* Tests that disabled checkpointing sets the checkpointing interval to Long.MAX_VALUE.
*/
@Test
public void testDisabledCheckpointing() throws Exception {
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    StreamGraph streamGraph = new StreamGraph(env, 1);
    assertFalse("Checkpointing enabled", streamGraph.getCheckpointConfig().isCheckpointingEnabled());

    StreamingJobGraphGenerator jobGraphGenerator = new StreamingJobGraphGenerator(streamGraph, 1);
    JobGraph jobGraph = jobGraphGenerator.createJobGraph();

    JobSnapshottingSettings snapshottingSettings = jobGraph.getSnapshotSettings();
    assertEquals(Long.MAX_VALUE, snapshottingSettings.getCheckpointInterval());
}
Use of org.apache.flink.runtime.jobgraph.tasks.JobSnapshottingSettings in project flink by apache.
In class ExecutionGraphDeploymentTest, the method createExecutionGraph:
private ExecutionGraph createExecutionGraph(Configuration configuration) throws Exception {
    final ScheduledExecutorService executor = TestingUtils.defaultExecutor();
    final JobID jobId = new JobID();
    final JobGraph jobGraph = new JobGraph(jobId, "test");
    jobGraph.setSnapshotSettings(new JobSnapshottingSettings(Collections.<JobVertexID>emptyList(), Collections.<JobVertexID>emptyList(), Collections.<JobVertexID>emptyList(), 100, 10 * 60 * 1000, 0, 1, ExternalizedCheckpointSettings.none(), null, false));
    return ExecutionGraphBuilder.buildGraph(null, jobGraph, configuration, executor, executor, new ProgrammedSlotProvider(1), getClass().getClassLoader(), new StandaloneCheckpointRecoveryFactory(), Time.seconds(10), new NoRestartStrategy(), new UnregisteredMetricsGroup(), 1, LoggerFactory.getLogger(getClass()));
}
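The positional arguments of the JobSnapshottingSettings call above line up with the parameters visible in the configureCheckpointing example earlier on this page. A commented equivalent follows as a sketch; the local variable names (noVertices, settings) are illustrative and not part of the test.

// A commented restatement of the call above; variable names are illustrative only.
List<JobVertexID> noVertices = Collections.<JobVertexID>emptyList();
JobSnapshottingSettings settings = new JobSnapshottingSettings(
    noVertices,                             // vertices that trigger checkpoints
    noVertices,                             // vertices that acknowledge checkpoints
    noVertices,                             // vertices that receive "commit checkpoint" messages
    100,                                    // checkpoint interval (ms)
    10 * 60 * 1000,                         // checkpoint timeout (ms)
    0,                                      // min pause between checkpoints (ms)
    1,                                      // max concurrent checkpoints
    ExternalizedCheckpointSettings.none(),  // externalized checkpoints disabled
    null,                                   // no state backend configured
    false);                                 // not exactly-once (at-least-once)
jobGraph.setSnapshotSettings(settings);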
Use of org.apache.flink.runtime.jobgraph.tasks.JobSnapshottingSettings in project flink by apache.
In class JobManagerTest, the method testSavepointRestoreSettings:
/**
* Tests that configured {@link SavepointRestoreSettings} are respected.
*/
@Test
public void testSavepointRestoreSettings() throws Exception {
    FiniteDuration timeout = new FiniteDuration(30, TimeUnit.SECONDS);

    ActorSystem actorSystem = null;
    ActorGateway jobManager = null;
    ActorGateway archiver = null;
    ActorGateway taskManager = null;

    try {
        actorSystem = AkkaUtils.createLocalActorSystem(new Configuration());

        Tuple2<ActorRef, ActorRef> master = JobManager.startJobManagerActors(new Configuration(), actorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), Option.apply("jm"), Option.apply("arch"), TestingJobManager.class, TestingMemoryArchivist.class);

        jobManager = new AkkaActorGateway(master._1(), null);
        archiver = new AkkaActorGateway(master._2(), null);

        Configuration tmConfig = new Configuration();
        tmConfig.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, 4);

        ActorRef taskManagerRef = TaskManager.startTaskManagerComponentsAndActor(tmConfig, ResourceID.generate(), actorSystem, "localhost", Option.apply("tm"), Option.<LeaderRetrievalService>apply(new StandaloneLeaderRetrievalService(jobManager.path())), true, TestingTaskManager.class);

        taskManager = new AkkaActorGateway(taskManagerRef, null);

        // Wait until connected
        Object msg = new TestingTaskManagerMessages.NotifyWhenRegisteredAtJobManager(jobManager.actor());
        Await.ready(taskManager.ask(msg, timeout), timeout);

        // Create job graph
        JobVertex sourceVertex = new JobVertex("Source");
        sourceVertex.setInvokableClass(BlockingStatefulInvokable.class);
        sourceVertex.setParallelism(1);

        JobGraph jobGraph = new JobGraph("TestingJob", sourceVertex);

        JobSnapshottingSettings snapshottingSettings = new JobSnapshottingSettings(
            Collections.singletonList(sourceVertex.getID()),
            Collections.singletonList(sourceVertex.getID()),
            Collections.singletonList(sourceVertex.getID()),
            // deactivated checkpointing
            Long.MAX_VALUE,
            360000,
            0,
            Integer.MAX_VALUE,
            ExternalizedCheckpointSettings.none(),
            null,
            true);

        jobGraph.setSnapshotSettings(snapshottingSettings);

        // Submit job graph
        msg = new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.DETACHED);
        Await.result(jobManager.ask(msg, timeout), timeout);

        // Wait for all tasks to be running
        msg = new TestingJobManagerMessages.WaitForAllVerticesToBeRunning(jobGraph.getJobID());
        Await.result(jobManager.ask(msg, timeout), timeout);

        // Trigger savepoint
        File targetDirectory = tmpFolder.newFolder();
        msg = new TriggerSavepoint(jobGraph.getJobID(), Option.apply(targetDirectory.getAbsolutePath()));

        Future<Object> future = jobManager.ask(msg, timeout);
        Object result = Await.result(future, timeout);
        String savepointPath = ((TriggerSavepointSuccess) result).savepointPath();

        // Cancel because of restarts
        msg = new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID());
        Future<?> removedFuture = jobManager.ask(msg, timeout);

        Future<?> cancelFuture = jobManager.ask(new CancelJob(jobGraph.getJobID()), timeout);
        Object response = Await.result(cancelFuture, timeout);
        assertTrue("Unexpected response: " + response, response instanceof CancellationSuccess);

        Await.ready(removedFuture, timeout);

        // Adjust the job (we need a new operator ID)
        JobVertex newSourceVertex = new JobVertex("NewSource");
        newSourceVertex.setInvokableClass(BlockingStatefulInvokable.class);
        newSourceVertex.setParallelism(1);

        JobGraph newJobGraph = new JobGraph("NewTestingJob", newSourceVertex);

        JobSnapshottingSettings newSnapshottingSettings = new JobSnapshottingSettings(
            Collections.singletonList(newSourceVertex.getID()),
            Collections.singletonList(newSourceVertex.getID()),
            Collections.singletonList(newSourceVertex.getID()),
            // deactivated checkpointing
            Long.MAX_VALUE,
            360000,
            0,
            Integer.MAX_VALUE,
            ExternalizedCheckpointSettings.none(),
            null,
            true);

        newJobGraph.setSnapshotSettings(newSnapshottingSettings);

        SavepointRestoreSettings restoreSettings = SavepointRestoreSettings.forPath(savepointPath, false);
        newJobGraph.setSavepointRestoreSettings(restoreSettings);

        msg = new JobManagerMessages.SubmitJob(newJobGraph, ListeningBehaviour.DETACHED);
        response = Await.result(jobManager.ask(msg, timeout), timeout);

        assertTrue("Unexpected response: " + response, response instanceof JobManagerMessages.JobResultFailure);
        JobManagerMessages.JobResultFailure failure = (JobManagerMessages.JobResultFailure) response;
        Throwable cause = failure.cause().deserializeError(ClassLoader.getSystemClassLoader());
        assertTrue(cause instanceof IllegalStateException);
        assertTrue(cause.getMessage().contains("allowNonRestoredState"));

        // Wait until removed
        msg = new TestingJobManagerMessages.NotifyWhenJobRemoved(newJobGraph.getJobID());
        Await.ready(jobManager.ask(msg, timeout), timeout);

        // Resubmit, but allow non restored state now
        restoreSettings = SavepointRestoreSettings.forPath(savepointPath, true);
        newJobGraph.setSavepointRestoreSettings(restoreSettings);

        msg = new JobManagerMessages.SubmitJob(newJobGraph, ListeningBehaviour.DETACHED);
        response = Await.result(jobManager.ask(msg, timeout), timeout);
        assertTrue("Unexpected response: " + response, response instanceof JobManagerMessages.JobSubmitSuccess);
    } finally {
        if (actorSystem != null) {
            actorSystem.shutdown();
        }
        if (archiver != null) {
            archiver.actor().tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (jobManager != null) {
            jobManager.actor().tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (taskManager != null) {
            taskManager.actor().tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
    }
}
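The two SavepointRestoreSettings used in the test are what drive its outcome: strict restore rejects the adjusted job because the savepoint still holds state for the old "Source" vertex, while lenient restore skips that state. A minimal sketch of the distinction, reusing the test's own savepointPath and newJobGraph; the variable names strict and lenient are illustrative.

// Strict restore: submission fails if the savepoint contains state for an
// operator that no longer exists in the job (the error mentions allowNonRestoredState).
SavepointRestoreSettings strict = SavepointRestoreSettings.forPath(savepointPath, false);

// Lenient restore: state that cannot be mapped to any operator is skipped,
// so the adjusted job with the new operator ID is accepted.
SavepointRestoreSettings lenient = SavepointRestoreSettings.forPath(savepointPath, true);

newJobGraph.setSavepointRestoreSettings(lenient);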
Use of org.apache.flink.runtime.jobgraph.tasks.JobSnapshottingSettings in project flink by apache.
In class JobManagerTest, the method testCancelWithSavepoint:
@Test
public void testCancelWithSavepoint() throws Exception {
    File defaultSavepointDir = tmpFolder.newFolder();
    FiniteDuration timeout = new FiniteDuration(30, TimeUnit.SECONDS);

    Configuration config = new Configuration();
    config.setString(ConfigConstants.SAVEPOINT_DIRECTORY_KEY, defaultSavepointDir.getAbsolutePath());

    ActorSystem actorSystem = null;
    ActorGateway jobManager = null;
    ActorGateway archiver = null;
    ActorGateway taskManager = null;

    try {
        actorSystem = AkkaUtils.createLocalActorSystem(new Configuration());

        Tuple2<ActorRef, ActorRef> master = JobManager.startJobManagerActors(config, actorSystem, TestingUtils.defaultExecutor(), TestingUtils.defaultExecutor(), Option.apply("jm"), Option.apply("arch"), TestingJobManager.class, TestingMemoryArchivist.class);

        jobManager = new AkkaActorGateway(master._1(), null);
        archiver = new AkkaActorGateway(master._2(), null);

        ActorRef taskManagerRef = TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), actorSystem, "localhost", Option.apply("tm"), Option.<LeaderRetrievalService>apply(new StandaloneLeaderRetrievalService(jobManager.path())), true, TestingTaskManager.class);

        taskManager = new AkkaActorGateway(taskManagerRef, null);

        // Wait until connected
        Object msg = new TestingTaskManagerMessages.NotifyWhenRegisteredAtJobManager(jobManager.actor());
        Await.ready(taskManager.ask(msg, timeout), timeout);

        // Create job graph
        JobVertex sourceVertex = new JobVertex("Source");
        sourceVertex.setInvokableClass(BlockingStatefulInvokable.class);
        sourceVertex.setParallelism(1);

        JobGraph jobGraph = new JobGraph("TestingJob", sourceVertex);

        JobSnapshottingSettings snapshottingSettings = new JobSnapshottingSettings(
            Collections.singletonList(sourceVertex.getID()),
            Collections.singletonList(sourceVertex.getID()),
            Collections.singletonList(sourceVertex.getID()),
            3600000,
            3600000,
            0,
            Integer.MAX_VALUE,
            ExternalizedCheckpointSettings.none(),
            null,
            true);

        jobGraph.setSnapshotSettings(snapshottingSettings);

        // Submit job graph
        msg = new JobManagerMessages.SubmitJob(jobGraph, ListeningBehaviour.DETACHED);
        Await.result(jobManager.ask(msg, timeout), timeout);

        // Wait for all tasks to be running
        msg = new TestingJobManagerMessages.WaitForAllVerticesToBeRunning(jobGraph.getJobID());
        Await.result(jobManager.ask(msg, timeout), timeout);

        // Notify when cancelled
        msg = new NotifyWhenJobStatus(jobGraph.getJobID(), JobStatus.CANCELED);
        Future<Object> cancelled = jobManager.ask(msg, timeout);

        // Cancel with savepoint
        String savepointPath = null;
        for (int i = 0; i < 10; i++) {
            msg = new JobManagerMessages.CancelJobWithSavepoint(jobGraph.getJobID(), null);
            CancellationResponse cancelResp = (CancellationResponse) Await.result(jobManager.ask(msg, timeout), timeout);
            if (cancelResp instanceof CancellationFailure) {
                CancellationFailure failure = (CancellationFailure) cancelResp;
                if (failure.cause().getMessage().contains(CheckpointDeclineReason.NOT_ALL_REQUIRED_TASKS_RUNNING.message())) {
                    // wait and retry
                    Thread.sleep(200);
                } else {
                    failure.cause().printStackTrace();
                    fail("Failed to cancel job: " + failure.cause().getMessage());
                }
            } else {
                savepointPath = ((CancellationSuccess) cancelResp).savepointPath();
                break;
            }
        }

        // Verify savepoint path
        assertNotEquals("Savepoint not triggered", null, savepointPath);

        // Wait for job status change
        Await.ready(cancelled, timeout);

        File savepointFile = new File(savepointPath);
        assertEquals(true, savepointFile.exists());
    } finally {
        if (actorSystem != null) {
            actorSystem.shutdown();
        }
        if (archiver != null) {
            archiver.actor().tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (jobManager != null) {
            jobManager.actor().tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
        if (taskManager != null) {
            taskManager.actor().tell(PoisonPill.getInstance(), ActorRef.noSender());
        }
    }
}
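Note that the test passes null as the target directory of CancelJobWithSavepoint and instead configures a default savepoint directory up front; the savepoint then lands under defaultSavepointDir. A minimal sketch of that fallback, reusing the test's own configuration key and message type; the directory path and the cancelMsg variable are illustrative.

// With no per-request target directory, the JobManager falls back to the
// directory configured under ConfigConstants.SAVEPOINT_DIRECTORY_KEY.
Configuration config = new Configuration();
config.setString(ConfigConstants.SAVEPOINT_DIRECTORY_KEY, "/tmp/flink-savepoints"); // illustrative path

// null target directory: use the configured default savepoint directory
Object cancelMsg = new JobManagerMessages.CancelJobWithSavepoint(jobGraph.getJobID(), null);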