Example use of org.apache.flink.runtime.testutils.CommonTestUtils.waitForAllTaskRunning in the Apache Flink project.
Taken from the class SavepointITCase, method testTriggerSavepointAndResumeWithNoClaim.
/**
 * Runs the same job graph three times on a local mini cluster: first from scratch, then
 * restored from the first run's checkpoint in {@code NO_CLAIM} mode, and finally restored from
 * the second run's checkpoint after the directory holding the first checkpoint was deleted.
 *
 * <p>The test passes when the third submission reaches the state where all tasks are running.
 */
@Test
@Ignore("Disabling this test because it regularly fails on AZP. See FLINK-25427.")
public void testTriggerSavepointAndResumeWithNoClaim() throws Exception {
    final int taskManagerCount = 2;
    final int slotsPerTaskManager = 2;
    final int totalSlots = taskManagerCount * slotsPerTaskManager;

    final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    // NOTE(review): the `true` argument presumably enables incremental checkpoints — confirm
    // against the EmbeddedRocksDBStateBackend constructor documentation.
    env.setStateBackend(new EmbeddedRocksDBStateBackend(true));
    final CheckpointConfig checkpointConfig = env.getCheckpointConfig();
    checkpointConfig.enableExternalizedCheckpoints(
            CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
    checkpointConfig.setCheckpointStorage(folder.newFolder().toURI());
    env.setParallelism(totalSlots);

    // The single-parallelism sink counts this latch down once per record it receives.
    final SharedReference<CountDownLatch> processedRecords =
            sharedObjects.add(new CountDownLatch(10_000));

    env.fromSequence(1, Long.MAX_VALUE)
            .keyBy(i -> i % totalSlots)
            .process(
                    new KeyedProcessFunction<Long, Long, Long>() {
                        private ListState<Long> last;

                        @Override
                        public void open(Configuration config) {
                            // List state is used on purpose so that the sst files grow to a
                            // significant size: sst files below certain thresholds are not
                            // stored as files but as a byte stream inside the checkpoint
                            // metadata.
                            final ListStateDescriptor<Long> descriptor =
                                    new ListStateDescriptor<>(
                                            "last", BasicTypeInfo.LONG_TYPE_INFO);
                            last = getRuntimeContext().getListState(descriptor);
                        }

                        @Override
                        public void processElement(
                                Long element,
                                KeyedProcessFunction<Long, Long, Long>.Context context,
                                Collector<Long> collector)
                                throws Exception {
                            last.add(element);
                            collector.collect(element);
                        }
                    })
            .addSink(
                    new SinkFunction<Long>() {
                        @Override
                        public void invoke(Long element) {
                            processedRecords.consumeSync(CountDownLatch::countDown);
                        }
                    })
            .setParallelism(1);

    final JobGraph jobGraph = env.getStreamGraph().getJobGraph();

    final MiniClusterResourceConfiguration clusterConfig =
            new MiniClusterResourceConfiguration.Builder()
                    .setNumberTaskManagers(taskManagerCount)
                    .setNumberSlotsPerTaskManager(slotsPerTaskManager)
                    .build();
    MiniClusterWithClientResource cluster = new MiniClusterWithClientResource(clusterConfig);
    cluster.before();
    try {
        // Run 1: start from scratch.
        final JobID firstJob = submitUnderFreshJobIdAndAwaitRunning(cluster, jobGraph);
        // Block until the latch has been counted down to zero, so that records have been
        // processed before the checkpoint is taken.
        processedRecords.get().await();
        final String firstCheckpoint = checkpointThenCancel(cluster, firstJob);

        // Run 2: restore from the first checkpoint in NO_CLAIM mode.
        jobGraph.setSavepointRestoreSettings(
                SavepointRestoreSettings.forPath(firstCheckpoint, false, RestoreMode.NO_CLAIM));
        final JobID secondJob = submitUnderFreshJobIdAndAwaitRunning(cluster, jobGraph);
        String secondCheckpoint = checkpointThenCancel(cluster, secondJob);

        // Remove the parent directory of the checkpoint that run 2 was restored from.
        FileUtils.deleteDirectory(Paths.get(new URI(firstCheckpoint)).getParent().toFile());

        // Run 3: restoring from the second checkpoint has to work even though that checkpoint
        // was built on top of the first one, which no longer exists.
        jobGraph.setSavepointRestoreSettings(
                SavepointRestoreSettings.forPath(
                        secondCheckpoint, false, RestoreMode.NO_CLAIM));
        submitUnderFreshJobIdAndAwaitRunning(cluster, jobGraph);
    } finally {
        cluster.after();
    }
}

/**
 * Assigns a newly created {@link JobID} to the job graph, submits it through the cluster
 * client and waits until all tasks of the job are running.
 */
private static JobID submitUnderFreshJobIdAndAwaitRunning(
        MiniClusterWithClientResource cluster, JobGraph jobGraph) throws Exception {
    final JobID jobId = new JobID();
    jobGraph.setJobID(jobId);
    cluster.getClusterClient().submitJob(jobGraph).get();
    CommonTestUtils.waitForAllTaskRunning(cluster.getMiniCluster(), jobId, false);
    return jobId;
}

/**
 * Triggers a checkpoint for the job, waits for its result, then requests cancellation of the
 * job and waits for that request to complete; returns the checkpoint trigger result.
 */
private static String checkpointThenCancel(MiniClusterWithClientResource cluster, JobID jobId)
        throws Exception {
    final String checkpointPath = cluster.getMiniCluster().triggerCheckpoint(jobId).get();
    cluster.getClusterClient().cancel(jobId).get();
    return checkpointPath;
}
Aggregations