use of scala.concurrent.duration.Deadline in project flink by apache.
the class RescalingITCase method testSavepointRescalingNonPartitionedStateCausesException.
/**
 * Tests that a job cannot be restarted from a savepoint with a different parallelism if the
 * rescaled operator has non-partitioned state.
 *
 * @throws Exception
 */
// Takes a savepoint of a job built with OperatorCheckpointMethod.NON_PARTITIONED state, cancels
// that job, then submits a second graph with parallelism numSlots instead of numSlots / 2. A
// JobExecutionException whose cause is an IllegalStateException is swallowed as the expected
// outcome; any other JobExecutionException is rethrown.
// NOTE(review): if the second submission completes without throwing, nothing in the visible
// code fails the test.
// NOTE(review): this listing appears extraction-damaged: closing braces are missing, no call
// that submits the first job graph is visible, and savepointPath is never read after it is
// assigned. Confirm against the upstream file before relying on these comments.
public void testSavepointRescalingNonPartitionedStateCausesException() throws Exception {
final int parallelism = numSlots / 2;
final int parallelism2 = numSlots;
final int maxParallelism = 13;
// One overall budget of 3 minutes; most blocking calls in the try block wait for whatever is
// left of it via deadline.timeLeft().
FiniteDuration timeout = new FiniteDuration(3, TimeUnit.MINUTES);
Deadline deadline = timeout.fromNow();
// jobID is non-null only while a job may still exist on the cluster; the finally block keys
// its cleanup on it.
JobID jobID = null;
ActorGateway jobManager = null;
try {
jobManager = cluster.getLeaderGateway(deadline.timeLeft());
JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED);
jobID = jobGraph.getJobID();
Object savepointResponse = null;
// wait until the operator is started
Future<Object> savepointPathFuture = jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobID, Option.<String>empty()), deadline.timeLeft());
// The savepoint reply gets its own 10 second limit rather than the remaining deadline.
FiniteDuration waitingTime = new FiniteDuration(10, TimeUnit.SECONDS);
savepointResponse = Await.result(savepointPathFuture, waitingTime);
// The reply is included as the assertion message so an unexpected response type is visible.
assertTrue(String.valueOf(savepointResponse), savepointResponse instanceof JobManagerMessages.TriggerSavepointSuccess);
final String savepointPath = ((JobManagerMessages.TriggerSavepointSuccess) savepointResponse).savepointPath();
// Ask for the removal notification before sending CancelJob, then wait for both replies.
Future<Object> jobRemovedFuture = jobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), deadline.timeLeft());
Future<Object> cancellationResponseFuture = jobManager.ask(new JobManagerMessages.CancelJob(jobID), deadline.timeLeft());
Object cancellationResponse = Await.result(cancellationResponseFuture, deadline.timeLeft());
assertTrue(cancellationResponse instanceof JobManagerMessages.CancellationSuccess);
Await.ready(jobRemovedFuture, deadline.timeLeft());
// job successfully removed
jobID = null;
// Same operator state method and maxParallelism, but parallelism2 (numSlots) instead of
// parallelism (numSlots / 2).
JobGraph scaledJobGraph = createJobGraphWithOperatorState(parallelism2, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED);
jobID = scaledJobGraph.getJobID();
cluster.submitJobAndWait(scaledJobGraph, false);
// Reached only when submitJobAndWait returned normally; no cleanup wait is needed then.
jobID = null;
} catch (JobExecutionException exception) {
if (exception.getCause() instanceof IllegalStateException) {
// we expect a IllegalStateException wrapped
// in a JobExecutionException, because the job containing non-partitioned state
// is being rescaled
} else {
throw exception;
} finally {
// clear any left overs from a possibly failed job
if (jobID != null && jobManager != null) {
// Cleanup waits up to the full timeout again, not just the time left on the deadline.
Future<Object> jobRemovedFuture = jobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), timeout);
try {
Await.ready(jobRemovedFuture, timeout);
} catch (TimeoutException | InterruptedException ie) {
// A timeout or interrupt during cleanup is reported as a test failure.
fail("Failed while cleaning up the cluster.");
use of scala.concurrent.duration.Deadline in project flink by apache.
the class RescalingITCase method testSavepointRescalingWithKeyedAndNonPartitionedState.
/**
 * Tests that a job with non-partitioned state can be restarted from a savepoint with a
 * different parallelism if the operators with non-partitioned state are not rescaled.
 *
 * @throws Exception
 */
// Checks the sink output of a job with keyed and non-partitioned operator state, takes a
// savepoint, cancels the job, then runs a second graph whose first argument is parallelism2
// (numSlots) instead of parallelism (numSlots / 2) and checks the sink output again.
// NOTE(review): this listing appears extraction-damaged: closing braces are missing (including
// those of the two for loops), no call that submits the first job graph is visible, and
// savepointPath is never read after it is assigned. Confirm against the upstream file.
public void testSavepointRescalingWithKeyedAndNonPartitionedState() throws Exception {
int numberKeys = 42;
int numberElements = 1000;
int numberElements2 = 500;
int parallelism = numSlots / 2;
int parallelism2 = numSlots;
int maxParallelism = 13;
// One overall budget of 3 minutes; the blocking calls in the try block wait for whatever is
// left of it via deadline.timeLeft().
FiniteDuration timeout = new FiniteDuration(3, TimeUnit.MINUTES);
Deadline deadline = timeout.fromNow();
ActorGateway jobManager = null;
// jobID is non-null only while a job may still exist on the cluster; the finally block keys
// its cleanup on it.
JobID jobID = null;
try {
jobManager = cluster.getLeaderGateway(deadline.timeLeft());
JobGraph jobGraph = createJobGraphWithKeyedAndNonPartitionedOperatorState(parallelism, maxParallelism, parallelism, numberKeys, numberElements, false, 100);
jobID = jobGraph.getJobID();
// wait til the sources have emitted numberElements for each key and completed a checkpoint
// NOTE(review): the result of await is discarded, so running out of time here does not by
// itself fail the test.
SubtaskIndexFlatMapper.workCompletedLatch.await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
// verify the current state
Set<Tuple2<Integer, Integer>> actualResult = CollectionSink.getElementsSet();
Set<Tuple2<Integer, Integer>> expectedResult = new HashSet<>();
// Expected per key: (operator index of the key's key group at parallelism, numberElements * key).
for (int key = 0; key < numberKeys; key++) {
int keyGroupIndex = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism);
expectedResult.add(Tuple2.of(KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(maxParallelism, parallelism, keyGroupIndex), numberElements * key));
assertEquals(expectedResult, actualResult);
// clear the CollectionSink set for the restarted job
// NOTE(review): no statement that clears the sink follows the comment above in this listing.
Future<Object> savepointPathFuture = jobManager.ask(new JobManagerMessages.TriggerSavepoint(jobID, Option.<String>empty()), deadline.timeLeft());
// The reply is cast without an instanceof check, so any other reply type would surface as a
// ClassCastException.
final String savepointPath = ((JobManagerMessages.TriggerSavepointSuccess) Await.result(savepointPathFuture, deadline.timeLeft())).savepointPath();
// Ask for the removal notification before sending CancelJob, then wait for both replies.
Future<Object> jobRemovedFuture = jobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), deadline.timeLeft());
Future<Object> cancellationResponseFuture = jobManager.ask(new JobManagerMessages.CancelJob(jobID), deadline.timeLeft());
Object cancellationResponse = Await.result(cancellationResponseFuture, deadline.timeLeft());
assertTrue(cancellationResponse instanceof JobManagerMessages.CancellationSuccess);
Await.ready(jobRemovedFuture, deadline.timeLeft());
// The first job is gone; nothing to clean up until the next graph is created.
jobID = null;
// Compared with the first graph: first argument parallelism2 instead of parallelism, element
// count numberElements + numberElements2, and the boolean flag true instead of false. The
// third argument stays at parallelism.
// NOTE(review): presumably the third argument is the parallelism of the non-partitioned
// operator, which is deliberately not rescaled; confirm against the helper's signature.
JobGraph scaledJobGraph = createJobGraphWithKeyedAndNonPartitionedOperatorState(parallelism2, maxParallelism, parallelism, numberKeys, numberElements + numberElements2, true, 100);
jobID = scaledJobGraph.getJobID();
cluster.submitJobAndWait(scaledJobGraph, false);
// Reached only when submitJobAndWait returned normally; no cleanup wait is needed then.
jobID = null;
Set<Tuple2<Integer, Integer>> actualResult2 = CollectionSink.getElementsSet();
Set<Tuple2<Integer, Integer>> expectedResult2 = new HashSet<>();
// Expected per key: (operator index of the key's key group at parallelism2,
// key * (numberElements + numberElements2)).
for (int key = 0; key < numberKeys; key++) {
int keyGroupIndex = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism);
expectedResult2.add(Tuple2.of(KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(maxParallelism, parallelism2, keyGroupIndex), key * (numberElements + numberElements2)));
assertEquals(expectedResult2, actualResult2);
} finally {
// clear the CollectionSink set for the restarted job
// clear any left overs from a possibly failed job
if (jobID != null && jobManager != null) {
// Cleanup waits up to the full timeout again, not just the time left on the deadline.
Future<Object> jobRemovedFuture = jobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(jobID), timeout);
try {
Await.ready(jobRemovedFuture, timeout);
} catch (TimeoutException | InterruptedException ie) {
// A timeout or interrupt during cleanup is reported as a test failure.
fail("Failed while cleaning up the cluster.");
use of scala.concurrent.duration.Deadline in project flink by apache.
the class SavepointITCase method testTriggerSavepointAndResumeWithFileBasedCheckpoints.
/**
 * Triggers a savepoint for a job that uses the FsStateBackend. We expect
 * that all checkpoint files are written to a new savepoint directory.
 *
 * <ol>
 * <li>Submit job, wait for some progress</li>
 * <li>Trigger savepoint and verify that savepoint has been created</li>
 * <li>Shut down the cluster, re-submit the job from the savepoint,
 * verify that the initial state has been reset, and
 * all tasks are running again</li>
 * <li>Cancel job, dispose the savepoint, and verify that everything
 * has been cleaned up</li>
 * </ol>
 */
// Configures a TestingCluster with the filesystem state backend, triggers a savepoint for a
// job, checks the files under the savepoint root directory, compares the task deployment
// descriptors gathered for the job with the state recorded in the savepoint, and finally
// disposes the savepoint and checks that its files are gone.
// NOTE(review): this listing appears extraction-damaged: closing braces are missing, several
// lines end in an orphaned string fragment followed by a closing parenthesis (looks like the
// residue of stripped statements), and no calls that start, stop or restart the cluster or
// resubmit the job are visible. Confirm against the upstream file before relying on these
// comments.
public void testTriggerSavepointAndResumeWithFileBasedCheckpoints() throws Exception {
// Config
final int numTaskManagers = 2;
final int numSlotsPerTaskManager = 2;
final int parallelism = numTaskManagers * numSlotsPerTaskManager;
// Single 5 minute budget shared by the waits below.
final Deadline deadline = new FiniteDuration(5, TimeUnit.MINUTES).fromNow();
final File testRoot = folder.newFolder();
TestingCluster flink = null;
try {
// Create a test actor system
// It is handed to the JavaTestKit further down.
ActorSystem testActorSystem = AkkaUtils.createDefaultActorSystem();
// Flink configuration
final Configuration config = new Configuration();
config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, numTaskManagers);
config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numSlotsPerTaskManager);
final File checkpointDir = new File(testRoot, "checkpoints");
final File savepointRootDir = new File(testRoot, "savepoints");
// Abort early if either directory cannot be created.
if (!checkpointDir.mkdir() || !savepointRootDir.mkdirs()) {
fail("Test setup failed: failed to create temporary directories.");
// Use file based checkpoints
config.setString(CoreOptions.STATE_BACKEND, "filesystem");
config.setString(FsStateBackendFactory.CHECKPOINT_DIRECTORY_URI_CONF_KEY, checkpointDir.toURI().toString());
// NOTE(review): a threshold of 0 presumably keeps state from being held inline so that every
// state handle is backed by a file; confirm against FsStateBackendFactory.
config.setString(FsStateBackendFactory.MEMORY_THRESHOLD_CONF_KEY, "0");
config.setString(ConfigConstants.SAVEPOINT_DIRECTORY_KEY, savepointRootDir.toURI().toString());
// Start Flink
flink = new TestingCluster(config);
// Submit the job
final JobGraph jobGraph = createJobGraph(parallelism, 0, 1000);
final JobID jobId = jobGraph.getJobID();
// Reset the static test job helpers
// Retrieve the job manager
ActorGateway jobManager = Await.result(flink.leaderGateway().future(), deadline.timeLeft());"Submitting job " + jobGraph.getJobID() + " in detached mode.");
flink.submitJobDetached(jobGraph);"Waiting for some progress.");
// wait for the JobManager to be ready
Future<Object> allRunning = jobManager.ask(new WaitForAllVerticesToBeRunning(jobId), deadline.timeLeft());
Await.ready(allRunning, deadline.timeLeft());
// wait for the Tasks to be ready
StatefulCounter.getProgressLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);"Triggering a savepoint.");
Future<Object> savepointPathFuture = jobManager.ask(new TriggerSavepoint(jobId, Option.<String>empty()), deadline.timeLeft());
// The reply is cast without a type check, so any other reply would surface as a
// ClassCastException.
final String savepointPath = ((TriggerSavepointSuccess) Await.result(savepointPathFuture, deadline.timeLeft())).savepointPath();"Retrieved savepoint path: " + savepointPath + ".");
// Retrieve the savepoint from the testing job manager"Requesting the savepoint.");
Future<Object> savepointFuture = jobManager.ask(new RequestSavepoint(savepointPath), deadline.timeLeft());
SavepointV1 savepoint = (SavepointV1) ((ResponseSavepoint) Await.result(savepointFuture, deadline.timeLeft())).savepoint();"Retrieved savepoint: " + savepointPath + ".");
// Shut down the Flink cluster (thereby canceling the job)"Shutting down Flink cluster.");
// - Verification START -------------------------------------------
// Only one savepoint should exist
// listFiles() returns null when the path is not a readable directory; that case is treated
// as a missing savepoint in the else branch.
File[] files = savepointRootDir.listFiles();
if (files != null) {
assertEquals("Savepoint not created in expected directory", 1, files.length);
assertTrue("Savepoint did not create self-contained directory", files[0].isDirectory());
File savepointDir = files[0];
File[] savepointFiles = savepointDir.listFiles();
// Expect one metadata file and one checkpoint file per stateful
// parallel subtask
String errMsg = "Did not write expected number of savepoint/checkpoint files to directory: " + Arrays.toString(savepointFiles);
assertEquals(errMsg, 1 + parallelism, savepointFiles.length);
} else {
fail("Savepoint not created in expected directory");
// We currently have the following directory layout: checkpointDir/jobId/chk-ID
// The per-job checkpoint directory may be absent altogether; if present it has to be empty.
File jobCheckpoints = new File(checkpointDir, jobId.toString());
if (jobCheckpoints.exists()) {
files = jobCheckpoints.listFiles();
assertNotNull("Checkpoint directory empty", files);
assertEquals("Checkpoints directory not clean: " + Arrays.toString(files), 0, files.length);
// - Verification END ---------------------------------------------
// Restart the cluster"Restarting Flink cluster.");
// Retrieve the job manager"Retrieving JobManager.");
jobManager = Await.result(flink.leaderGateway().future(), deadline.timeLeft());"JobManager: " + jobManager + ".");
// Reset static test helpers
// Gather all task deployment descriptors
// One-element array so the anonymous class below can hand a caught Throwable back to this
// method.
final Throwable[] error = new Throwable[1];
// Final alias: flink is reassigned above and so cannot be captured by the anonymous class.
final TestingCluster finalFlink = flink;
final Multimap<JobVertexID, TaskDeploymentDescriptor> tdds = HashMultimap.create();
new JavaTestKit(testActorSystem) {
new Within(deadline.timeLeft()) {
protected void run() {
try {
// Register to all submit task messages for job
for (ActorRef taskManager : finalFlink.getTaskManagersAsJava()) {
taskManager.tell(new TestingTaskManagerMessages.RegisterSubmitTaskListener(jobId), getTestActor());
// Set the savepoint path
jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));"Resubmitting job " + jobGraph.getJobID() + " with " + "savepoint path " + savepointPath + " in detached mode.");
// Submit the job
// Total number of subtasks is the sum of the parallelism of every vertex in the job graph.
int numTasks = 0;
for (JobVertex jobVertex : jobGraph.getVertices()) {
numTasks += jobVertex.getParallelism();
// Gather the task deployment descriptors"Gathering " + numTasks + " submitted " + "TaskDeploymentDescriptor instances.");
// Wait for one ResponseSubmitTaskListener message per subtask and index its descriptor by
// the job vertex id read from the deserialized task information.
for (int i = 0; i < numTasks; i++) {
ResponseSubmitTaskListener resp = (ResponseSubmitTaskListener) expectMsgAnyClassOf(getRemainingTime(), ResponseSubmitTaskListener.class);
TaskDeploymentDescriptor tdd = resp.tdd();"Received: " + tdd.toString() + ".");
TaskInformation taskInformation = tdd.getSerializedTaskInformation().deserializeValue(getClass().getClassLoader());
tdds.put(taskInformation.getJobVertexId(), tdd);
} catch (Throwable t) {
// Stored here and asserted to be null after the test kit block.
error[0] = t;
// - Verification START -------------------------------------------
String errMsg = "Error during gathering of TaskDeploymentDescriptors";
assertNull(errMsg, error[0]);
// have a matching task deployment descriptor.
// Every task state in the savepoint needs at least one descriptor, as many descriptors as
// collected states, and an equal legacy operator state for each descriptor's subtask index.
for (TaskState taskState : savepoint.getTaskStates()) {
Collection<TaskDeploymentDescriptor> taskTdds = tdds.get(taskState.getJobVertexID());
errMsg = "Missing task for savepoint state for operator " + taskState.getJobVertexID() + ".";
assertTrue(errMsg, taskTdds.size() > 0);
assertEquals(taskState.getNumberCollectedStates(), taskTdds.size());
for (TaskDeploymentDescriptor tdd : taskTdds) {
SubtaskState subtaskState = taskState.getState(tdd.getSubtaskIndex());
errMsg = "Initial operator state mismatch.";
assertEquals(errMsg, subtaskState.getLegacyOperatorState(), tdd.getTaskStateHandles().getLegacyOperatorState());
// Await state is restored
StatefulCounter.getRestoreLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
// Await some progress after restore
StatefulCounter.getProgressLatch().await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
// - Verification END ---------------------------------------------"Cancelling job " + jobId + ".");
jobManager.tell(new CancelJob(jobId));"Disposing savepoint " + savepointPath + ".");
Future<Object> disposeFuture = jobManager.ask(new DisposeSavepoint(savepointPath), deadline.timeLeft());
errMsg = "Failed to dispose savepoint " + savepointPath + ".";
Object resp = Await.result(disposeFuture, deadline.timeLeft());
// Success is recognised by comparing the runtime class of the reply with the class of the
// object returned by getDisposeSavepointSuccess().
assertTrue(errMsg, resp.getClass() == getDisposeSavepointSuccess().getClass());
// - Verification START -------------------------------------------
// The checkpoint files
// Collect the file behind every non-null legacy operator state handle in the savepoint; each
// handle is cast to FileStateHandle without a type check.
List<File> checkpointFiles = new ArrayList<>();
for (TaskState stateForTaskGroup : savepoint.getTaskStates()) {
for (SubtaskState subtaskState : stateForTaskGroup.getStates()) {
ChainedStateHandle<StreamStateHandle> streamTaskState = subtaskState.getLegacyOperatorState();
for (int i = 0; i < streamTaskState.getLength(); i++) {
if (streamTaskState.get(i) != null) {
FileStateHandle fileStateHandle = (FileStateHandle) streamTaskState.get(i);
checkpointFiles.add(new File(fileStateHandle.getFilePath().toUri()));
// The checkpoint files of the savepoint should have been discarded
for (File f : checkpointFiles) {
errMsg = "Checkpoint file " + f + " not cleaned up properly.";
assertFalse(errMsg, f.exists());
// The directory containing the first checkpoint file must be gone as well.
if (checkpointFiles.size() > 0) {
File parent = checkpointFiles.get(0).getParentFile();
errMsg = "Checkpoint parent directory " + parent + " not cleaned up properly.";
assertFalse(errMsg, parent.exists());
// All savepoints should have been cleaned up
// NOTE(review): listFiles() can return null here (for example if the directory itself no
// longer exists), which would surface as a NullPointerException rather than an assertion
// failure.
errMsg = "Savepoints directory not cleaned up properly: " + Arrays.toString(savepointRootDir.listFiles()) + ".";
assertEquals(errMsg, 0, savepointRootDir.listFiles().length);
// - Verification END ---------------------------------------------
} finally {
// NOTE(review): the body of this guard is not present in this listing.
if (flink != null) {
use of scala.concurrent.duration.Deadline in project flink by apache.
the class SavepointITCase method testSubmitWithUnknownSavepointPath.
// Submits a job whose savepoint restore path is set to a path that does not exist and, if the
// submission throws, requires the exception to be exactly a JobExecutionException whose cause
// is exactly a FileNotFoundException.
// NOTE(review): if submitJobAndWait returns normally no assertion runs, so the test would pass
// without the expected failure; check whether a fail() after the call is intended.
// NOTE(review): this listing appears extraction-damaged: closing braces are missing, several
// lines end in an orphaned string fragment followed by a closing parenthesis, and no calls
// that start or stop the cluster are visible. Confirm against the upstream file.
public void testSubmitWithUnknownSavepointPath() throws Exception {
// Config
int numTaskManagers = 1;
int numSlotsPerTaskManager = 1;
int parallelism = numTaskManagers * numSlotsPerTaskManager;
// Test deadline
final Deadline deadline = new FiniteDuration(5, TimeUnit.MINUTES).fromNow();
final File tmpDir = CommonTestUtils.createTempDirectory();
// Only the path is built here; the directory is not created in the visible code.
final File savepointDir = new File(tmpDir, "savepoints");
TestingCluster flink = null;
try {
// Flink configuration
final Configuration config = new Configuration();
config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, numTaskManagers);
config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, numSlotsPerTaskManager);
config.setString(ConfigConstants.SAVEPOINT_DIRECTORY_KEY, savepointDir.toURI().toString());"Flink configuration: " + config + ".");
// Start Flink
flink = new TestingCluster(config);"Starting Flink cluster.");
// Retrieve the job manager"Retrieving JobManager.");
ActorGateway jobManager = Await.result(flink.leaderGateway().future(), deadline.timeLeft());"JobManager: " + jobManager + ".");
// High value to ensure timeouts if restarted.
int numberOfRetries = 1000;
// Submit the job
// Long delay to ensure that the test times out if the job
// manager tries to restart the job.
final JobGraph jobGraph = createJobGraph(parallelism, numberOfRetries, 3600000);
// Set non-existing savepoint path
jobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath("unknown path"));
// Sanity check that the restore path was stored on the job graph.
assertEquals("unknown path", jobGraph.getSavepointRestoreSettings().getRestorePath());"Submitting job " + jobGraph.getJobID() + " in detached mode.");
try {
flink.submitJobAndWait(jobGraph, false);
} catch (Exception e) {
// Exact class comparison: a subclass of either exception type would not satisfy these
// assertions.
assertEquals(JobExecutionException.class, e.getClass());
assertEquals(FileNotFoundException.class, e.getCause().getClass());
} finally {
// NOTE(review): the body of this guard is not present in this listing.
if (flink != null) {
use of scala.concurrent.duration.Deadline in project flink by apache.
the class ClassLoaderITCase method testDisposeSavepointWithCustomKvState.
/**
 * Tests disposal of a savepoint that contains custom user code KvState.
 */
// Builds a PackagedProgram from CUSTOM_KV_STATE_JAR_PATH, polls the job manager until a
// running job is reported, triggers a savepoint for it (up to 20 attempts), uploads the jar
// through the BlobClient and then disposes the savepoint, throwing IllegalStateException on a
// failed or unexpected dispose reply.
// NOTE(review): this listing appears extraction-damaged and may be cut off at the end: closing
// braces are missing, several lines end in an orphaned string fragment followed by a closing
// parenthesis, the try body inside the Runnable is empty and the thread is never started in
// the visible code. Confirm against the upstream file before relying on these comments.
public void testDisposeSavepointWithCustomKvState() throws Exception {
// Single 100 second budget shared by the waits below.
Deadline deadline = new FiniteDuration(100, TimeUnit.SECONDS).fromNow();
int port = testCluster.getLeaderRPCPort();
File checkpointDir = FOLDER.newFolder();
File outputDir = FOLDER.newFolder();
final PackagedProgram program = new PackagedProgram(new File(CUSTOM_KV_STATE_JAR_PATH), new String[] { CUSTOM_KV_STATE_JAR_PATH, "localhost", String.valueOf(port), String.valueOf(parallelism), checkpointDir.toURI().toString(), "5000", outputDir.toURI().toString() });
// Execute detached
// NOTE(review): presumably the stripped try body invoked the program; as shown it does
// nothing and ProgramInvocationException is silently ignored.
Thread invokeThread = new Thread(new Runnable() {
public void run() {
try {
} catch (ProgramInvocationException ignored) {
});"Starting program invoke thread");
// The job ID
JobID jobId = null;
ActorGateway jm = testCluster.getLeaderGateway(deadline.timeLeft());"Waiting for job status running.");
// Wait for running job
// Keeps asking for the running jobs status until a job id has been picked up or the deadline
// has no time left.
while (jobId == null && deadline.hasTimeLeft()) {
Future<Object> jobsFuture = jm.ask(JobManagerMessages.getRequestRunningJobsStatus(), deadline.timeLeft());
RunningJobsStatus runningJobs = (RunningJobsStatus) Await.result(jobsFuture, deadline.timeLeft());
// NOTE(review): no break is visible, so with several status messages the last one would
// determine jobId.
for (JobStatusMessage runningJob : runningJobs.getStatusMessages()) {
jobId = runningJob.getJobId();"Job running. ID: " + jobId);
// Retry if job is not available yet
// NOTE(review): the body of this branch is empty in this listing; presumably it paused
// briefly before the next poll.
if (jobId == null) {
}"Wait for all tasks to be running.");
// NOTE(review): jobId is still null here if the deadline expired in the loop above.
Future<Object> allRunning = jm.ask(new WaitForAllVerticesToBeRunning(jobId), deadline.timeLeft());
Await.ready(allRunning, deadline.timeLeft());"All tasks are running.");
// Trigger savepoint
String savepointPath = null;
// Each of the up to 20 attempts sends TriggerSavepoint and branches on the exact runtime
// class of the reply.
// NOTE(review): no break after a successful reply is visible in this listing.
for (int i = 0; i < 20; i++) {"Triggering savepoint (" + (i + 1) + "/20).");
Future<Object> savepointFuture = jm.ask(new TriggerSavepoint(jobId, Option.<String>empty()), deadline.timeLeft());
Object savepointResponse = Await.result(savepointFuture, deadline.timeLeft());
if (savepointResponse.getClass() == TriggerSavepointSuccess.class) {
savepointPath = ((TriggerSavepointSuccess) savepointResponse).savepointPath();"Triggered savepoint. Path: " + savepointPath);
} else if (savepointResponse.getClass() == JobManagerMessages.TriggerSavepointFailure.class) {
Throwable cause = ((JobManagerMessages.TriggerSavepointFailure) savepointResponse).cause();"Failed to trigger savepoint. Retrying...", cause);
// This can fail if the operators are not opened yet
} else {
throw new IllegalStateException("Unexpected response to TriggerSavepoint");
// Fails the test when no attempt produced a savepoint path.
assertNotNull("Failed to trigger savepoint", savepointPath);
// Upload JAR"Uploading JAR " + CUSTOM_KV_STATE_JAR_PATH + " for savepoint disposal.");
// blobKeys is not read again in the visible code.
List<BlobKey> blobKeys = BlobClient.uploadJarFiles(jm, deadline.timeLeft(), testCluster.userConfiguration(), Collections.singletonList(new Path(CUSTOM_KV_STATE_JAR_PATH)));
// Dispose savepoint"Disposing savepoint at " + savepointPath);
Future<Object> disposeFuture = jm.ask(new DisposeSavepoint(savepointPath), deadline.timeLeft());
Object disposeResponse = Await.result(disposeFuture, deadline.timeLeft());
// Success is recognised by comparing the runtime class of the reply with the class of the
// object returned by getDisposeSavepointSuccess(); a DisposeSavepointFailure or any other
// reply ends the test with an IllegalStateException.
if (disposeResponse.getClass() == JobManagerMessages.getDisposeSavepointSuccess().getClass()) {
// Success :-)"Disposed savepoint at " + savepointPath);
} else if (disposeResponse instanceof DisposeSavepointFailure) {
throw new IllegalStateException("Failed to dispose savepoint " + disposeResponse);
} else {
throw new IllegalStateException("Unexpected response to DisposeSavepoint");