Use of org.apache.flink.api.common.time.Deadline in project flink by apache.
From the class AbstractOperatorRestoreTestBase, method testMigrationAndRestore.
@Test
public void testMigrationAndRestore() throws Throwable {
    ClusterClient<?> clusterClient = cluster.getClusterClient();
    final Deadline deadline = Deadline.now().plus(TEST_TIMEOUT);
    // submit job with old version savepoint and create a migrated savepoint in the new version
    String savepointPath = migrateJob(clusterClient, deadline);
    // restore from migrated new version savepoint
    restoreJob(clusterClient, deadline, savepointPath);
}
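The test creates a single Deadline from the test timeout and hands it to both phases, so migration and restore draw on one shared time budget rather than each getting a fresh timeout. The sketch below shows that pattern in isolation, using only the Deadline calls visible above; the TEST_TIMEOUT constant, the phase methods, and the polled condition are placeholders, not part of the Flink test.

import java.time.Duration;
import org.apache.flink.api.common.time.Deadline;

public class DeadlineBudgetSketch {
    // hypothetical overall budget, standing in for the test's TEST_TIMEOUT constant
    private static final Duration TEST_TIMEOUT = Duration.ofMinutes(5);

    public static void main(String[] args) throws InterruptedException {
        // one deadline created up front; every phase consumes from the same budget
        Deadline deadline = Deadline.now().plus(TEST_TIMEOUT);
        firstPhase(deadline);
        secondPhase(deadline);
    }

    private static void firstPhase(Deadline deadline) throws InterruptedException {
        // poll some condition, but never longer than what is left of the shared budget
        while (deadline.hasTimeLeft() && !conditionReached()) {
            Thread.sleep(50);
        }
    }

    private static void secondPhase(Deadline deadline) {
        // the remaining time can be handed to blocking calls as a bounded wait
        long remainingMillis = deadline.timeLeft().toMillis();
        System.out.println("second phase may block for at most " + remainingMillis + " ms");
    }

    private static boolean conditionReached() {
        return true; // placeholder condition for the sketch
    }
}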
Use of org.apache.flink.api.common.time.Deadline in project flink by apache.
From the class UnalignedCheckpointStressITCase, method runStressTest.
@Test
public void runStressTest() throws Exception {
    Deadline deadline = Deadline.fromNow(Duration.ofMillis(TEST_DURATION));
    Optional<File> externalizedCheckpoint = Optional.empty();
    while (deadline.hasTimeLeft()) {
        externalizedCheckpoint = Optional.of(runAndTakeExternalCheckpoint(externalizedCheckpoint));
        cleanDirectoryExcept(externalizedCheckpoint.get());
    }
}
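Here the budget is created with Deadline.fromNow(Duration), equivalent to Deadline.now().plus(Duration), and hasTimeLeft() bounds how long the stress loop keeps iterating, with each iteration resuming from the previous iteration's externalized checkpoint. The following sketch mirrors that loop shape with plain strings standing in for checkpoints; runIteration is a hypothetical stand-in for runAndTakeExternalCheckpoint, not a Flink API.

import java.time.Duration;
import java.util.Optional;
import org.apache.flink.api.common.time.Deadline;

public class BoundedStressLoopSketch {

    public static void main(String[] args) {
        // Deadline.fromNow(duration) starts the clock at the moment of the call
        Deadline deadline = Deadline.fromNow(Duration.ofSeconds(10));

        // the previous iteration's result is fed into the next one, mirroring how the
        // stress test resumes each run from the last externalized checkpoint
        Optional<String> lastArtifact = Optional.empty();
        int iteration = 0;
        while (deadline.hasTimeLeft()) {
            lastArtifact = Optional.of(runIteration(lastArtifact, iteration++));
        }
        System.out.println("finished after " + iteration + " iterations, last: " + lastArtifact.orElse("none"));
    }

    // hypothetical stand-in for runAndTakeExternalCheckpoint(...)
    private static String runIteration(Optional<String> previous, int iteration) {
        return "artifact-" + iteration + previous.map(p -> " (resumed from " + p + ")").orElse("");
    }
}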
Use of org.apache.flink.api.common.time.Deadline in project flink by apache.
From the class RescalingITCase, method testSavepointRescalingKeyedState.
/**
* Tests that a job with purely keyed state can be restarted from a savepoint with a different
* parallelism.
*/
public void testSavepointRescalingKeyedState(boolean scaleOut, boolean deriveMaxParallelism) throws Exception {
    final int numberKeys = 42;
    final int numberElements = 1000;
    final int numberElements2 = 500;
    final int parallelism = scaleOut ? numSlots / 2 : numSlots;
    final int parallelism2 = scaleOut ? numSlots : numSlots / 2;
    final int maxParallelism = 13;
    Duration timeout = Duration.ofMinutes(3);
    Deadline deadline = Deadline.now().plus(timeout);
    ClusterClient<?> client = cluster.getClusterClient();
    try {
        JobGraph jobGraph = createJobGraphWithKeyedState(parallelism, maxParallelism, numberKeys, numberElements, false, 100);
        final JobID jobID = jobGraph.getJobID();
        client.submitJob(jobGraph).get();
        // wait until the sources have emitted numberElements for each key and completed a checkpoint
        assertTrue(SubtaskIndexFlatMapper.workCompletedLatch.await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));
        // verify the current state
        Set<Tuple2<Integer, Integer>> actualResult = CollectionSink.getElementsSet();
        Set<Tuple2<Integer, Integer>> expectedResult = new HashSet<>();
        for (int key = 0; key < numberKeys; key++) {
            int keyGroupIndex = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism);
            expectedResult.add(Tuple2.of(KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(maxParallelism, parallelism, keyGroupIndex), numberElements * key));
        }
        assertEquals(expectedResult, actualResult);
        // clear the CollectionSink set for the restarted job
        CollectionSink.clearElementsSet();
        waitForAllTaskRunning(cluster.getMiniCluster(), jobGraph.getJobID(), false);
        CompletableFuture<String> savepointPathFuture = client.triggerSavepoint(jobID, null, SavepointFormatType.CANONICAL);
        final String savepointPath = savepointPathFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        client.cancel(jobID).get();
        while (!getRunningJobs(client).isEmpty()) {
            Thread.sleep(50);
        }
        int restoreMaxParallelism = deriveMaxParallelism ? JobVertex.MAX_PARALLELISM_DEFAULT : maxParallelism;
        JobGraph scaledJobGraph = createJobGraphWithKeyedState(parallelism2, restoreMaxParallelism, numberKeys, numberElements2, true, 100);
        scaledJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
        submitJobAndWaitForResult(client, scaledJobGraph, getClass().getClassLoader());
        Set<Tuple2<Integer, Integer>> actualResult2 = CollectionSink.getElementsSet();
        Set<Tuple2<Integer, Integer>> expectedResult2 = new HashSet<>();
        for (int key = 0; key < numberKeys; key++) {
            int keyGroupIndex = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism);
            expectedResult2.add(Tuple2.of(KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(maxParallelism, parallelism2, keyGroupIndex), key * (numberElements + numberElements2)));
        }
        assertEquals(expectedResult2, actualResult2);
    } finally {
        // clear the CollectionSink set for the restarted job
        CollectionSink.clearElementsSet();
    }
}
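Worth noting in this test is how deadline.timeLeft().toMillis() is passed to each blocking call (the latch await and the savepoint future), so later waits automatically shrink by however long the earlier ones took. Below is a minimal sketch of that shrinking-budget pattern; it uses a trivially completed latch and future so it runs standalone and is not part of the Flink test.

import java.time.Duration;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import org.apache.flink.api.common.time.Deadline;

public class ShrinkingBudgetSketch {

    public static void main(String[] args) throws Exception {
        Deadline deadline = Deadline.now().plus(Duration.ofMinutes(3));

        // first blocking step: a latch await bounded by what is currently left
        CountDownLatch workCompleted = new CountDownLatch(1);
        workCompleted.countDown(); // released immediately in this sketch
        boolean completed = workCompleted.await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

        // second blocking step: the same deadline is queried again, so this wait
        // shrinks by however long the first step took
        CompletableFuture<String> savepoint = CompletableFuture.completedFuture("file:///tmp/savepoint-sketch");
        String path = savepoint.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);

        System.out.println("completed=" + completed + ", path=" + path);
    }
}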
Use of org.apache.flink.api.common.time.Deadline in project flink by apache.
From the class RescalingITCase, method testSavepointRescalingWithKeyedAndNonPartitionedState.
/**
 * Tests that a job with non-partitioned state can be restarted from a savepoint with a
 * different parallelism if the operators with non-partitioned state are not rescaled.
 *
 * @throws Exception
 */
@Test
public void testSavepointRescalingWithKeyedAndNonPartitionedState() throws Exception {
    int numberKeys = 42;
    int numberElements = 1000;
    int numberElements2 = 500;
    int parallelism = numSlots / 2;
    int parallelism2 = numSlots;
    int maxParallelism = 13;
    Duration timeout = Duration.ofMinutes(3);
    Deadline deadline = Deadline.now().plus(timeout);
    ClusterClient<?> client = cluster.getClusterClient();
    try {
        JobGraph jobGraph = createJobGraphWithKeyedAndNonPartitionedOperatorState(parallelism, maxParallelism, parallelism, numberKeys, numberElements, false, 100);
        final JobID jobID = jobGraph.getJobID();
        // make sure the job does not finish before we take the savepoint
        StateSourceBase.canFinishLatch = new CountDownLatch(1);
        client.submitJob(jobGraph).get();
        // wait until the sources have emitted numberElements for each key and completed a checkpoint
        assertTrue(SubtaskIndexFlatMapper.workCompletedLatch.await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS));
        // verify the current state
        Set<Tuple2<Integer, Integer>> actualResult = CollectionSink.getElementsSet();
        Set<Tuple2<Integer, Integer>> expectedResult = new HashSet<>();
        for (int key = 0; key < numberKeys; key++) {
            int keyGroupIndex = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism);
            expectedResult.add(Tuple2.of(KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(maxParallelism, parallelism, keyGroupIndex), numberElements * key));
        }
        assertEquals(expectedResult, actualResult);
        // clear the CollectionSink set for the restarted job
        CollectionSink.clearElementsSet();
        waitForAllTaskRunning(cluster.getMiniCluster(), jobGraph.getJobID(), false);
        CompletableFuture<String> savepointPathFuture = client.triggerSavepoint(jobID, null, SavepointFormatType.CANONICAL);
        final String savepointPath = savepointPathFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        // we took a savepoint, the job can finish now
        StateSourceBase.canFinishLatch.countDown();
        client.cancel(jobID).get();
        while (!getRunningJobs(client).isEmpty()) {
            Thread.sleep(50);
        }
        JobGraph scaledJobGraph = createJobGraphWithKeyedAndNonPartitionedOperatorState(parallelism2, maxParallelism, parallelism, numberKeys, numberElements + numberElements2, true, 100);
        scaledJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
        submitJobAndWaitForResult(client, scaledJobGraph, getClass().getClassLoader());
        Set<Tuple2<Integer, Integer>> actualResult2 = CollectionSink.getElementsSet();
        Set<Tuple2<Integer, Integer>> expectedResult2 = new HashSet<>();
        for (int key = 0; key < numberKeys; key++) {
            int keyGroupIndex = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism);
            expectedResult2.add(Tuple2.of(KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(maxParallelism, parallelism2, keyGroupIndex), key * (numberElements + numberElements2)));
        }
        assertEquals(expectedResult2, actualResult2);
    } finally {
        // clear the CollectionSink set for the restarted job
        CollectionSink.clearElementsSet();
    }
}
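Both rescaling tests predict which subtask owns each key by recomputing Flink's key-group routing: assignToKeyGroup depends only on the key and maxParallelism, while computeOperatorIndexForKeyGroup additionally depends on the current parallelism, which is why the expected per-key sums move between subtasks after rescaling. The sketch below prints that routing for a few keys before and after a parallelism change; the parallelism values are illustrative and not taken from the test.

import org.apache.flink.runtime.state.KeyGroupRangeAssignment;

public class KeyGroupRoutingSketch {

    public static void main(String[] args) {
        final int maxParallelism = 13;
        final int parallelismBefore = 2;
        final int parallelismAfter = 4;

        for (int key = 0; key < 5; key++) {
            // the key group is stable as long as maxParallelism does not change ...
            int keyGroup = KeyGroupRangeAssignment.assignToKeyGroup(key, maxParallelism);
            // ... but the owning operator subtask depends on the current parallelism
            int subtaskBefore = KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(maxParallelism, parallelismBefore, keyGroup);
            int subtaskAfter = KeyGroupRangeAssignment.computeOperatorIndexForKeyGroup(maxParallelism, parallelismAfter, keyGroup);
            System.out.println("key=" + key + " keyGroup=" + keyGroup
                    + " subtask@p" + parallelismBefore + "=" + subtaskBefore
                    + " subtask@p" + parallelismAfter + "=" + subtaskAfter);
        }
    }
}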
Use of org.apache.flink.api.common.time.Deadline in project flink by apache.
From the class RescalingITCase, method testSavepointRescalingNonPartitionedStateCausesException.
/**
* Tests that a job cannot be restarted from a savepoint with a different parallelism if the
* rescaled operator has non-partitioned state.
*
* @throws Exception
*/
@Test
public void testSavepointRescalingNonPartitionedStateCausesException() throws Exception {
    final int parallelism = numSlots / 2;
    final int parallelism2 = numSlots;
    final int maxParallelism = 13;
    Duration timeout = Duration.ofMinutes(3);
    Deadline deadline = Deadline.now().plus(timeout);
    ClusterClient<?> client = cluster.getClusterClient();
    try {
        JobGraph jobGraph = createJobGraphWithOperatorState(parallelism, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED);
        // make sure the job does not finish before we take the savepoint
        StateSourceBase.canFinishLatch = new CountDownLatch(1);
        final JobID jobID = jobGraph.getJobID();
        client.submitJob(jobGraph).get();
        // wait until the operator is started
        waitForAllTaskRunning(cluster.getMiniCluster(), jobGraph.getJobID(), false);
        // wait until the operator handles some data
        StateSourceBase.workStartedLatch.await();
        CompletableFuture<String> savepointPathFuture = client.triggerSavepoint(jobID, null, SavepointFormatType.CANONICAL);
        final String savepointPath = savepointPathFuture.get(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS);
        // we took a savepoint, the job can finish now
        StateSourceBase.canFinishLatch.countDown();
        client.cancel(jobID).get();
        while (!getRunningJobs(client).isEmpty()) {
            Thread.sleep(50);
        }
        // job successfully removed
        JobGraph scaledJobGraph = createJobGraphWithOperatorState(parallelism2, maxParallelism, OperatorCheckpointMethod.NON_PARTITIONED);
        scaledJobGraph.setSavepointRestoreSettings(SavepointRestoreSettings.forPath(savepointPath));
        submitJobAndWaitForResult(client, scaledJobGraph, getClass().getClassLoader());
    } catch (JobExecutionException exception) {
        if (exception.getCause() instanceof IllegalStateException) {
            // we expect an IllegalStateException wrapped in a JobExecutionException,
            // because the job containing non-partitioned state is being rescaled
        } else {
            throw exception;
        }
    }
}
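One detail shared by all three RescalingITCase examples above is the unbounded cancellation wait (while (!getRunningJobs(client).isEmpty()) Thread.sleep(50)) sitting next to otherwise deadline-driven code. The same Deadline class can bound such polling; the pollUntil helper below is a hypothetical sketch for illustration, not something the Flink tests define.

import java.time.Duration;
import java.util.function.Supplier;
import org.apache.flink.api.common.time.Deadline;

public class DeadlineBoundedPollingSketch {

    // hypothetical helper: poll a condition until it holds or the deadline expires
    static boolean pollUntil(Supplier<Boolean> condition, Deadline deadline, long pollIntervalMillis)
            throws InterruptedException {
        while (deadline.hasTimeLeft()) {
            if (condition.get()) {
                return true;
            }
            Thread.sleep(pollIntervalMillis);
        }
        return condition.get();
    }

    public static void main(String[] args) throws InterruptedException {
        Deadline deadline = Deadline.fromNow(Duration.ofSeconds(5));
        // in the tests the condition would be getRunningJobs(client).isEmpty()
        boolean done = pollUntil(() -> true, deadline, 50);
        System.out.println("condition reached in time: " + done);
    }
}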