use of org.apache.flink.testutils.junit.RetryOnFailure in project flink by apache.
the class JobManagerHACheckpointRecoveryITCase method testCheckpointedStreamingSumProgram.
/**
 * Simple checkpointed streaming sum.
 *
 * <p>The sources (Parallelism) count until sequenceEnd. The sink (1) sums up all counts and
 * returns it to the main thread via a static variable. We wait until some checkpoints are
 * completed and sanity check that the sources recover with an updated state to make sure that
 * this test actually tests something.
 *
 * <p>Test flow: start two JobManager processes and a TaskManager, submit the job to the current
 * leader, wait on {@code CompletedCheckpointsLatch}, destroy the leading JobManager process,
 * wait until the job is RUNNING again on the newly elected leader and finally compare the
 * collected sum with the expected one.
 *
 * @throws Exception any failure of the test; before it is rethrown the static test state is
 *     reset (for retries) and the JobManager process logs are printed
 */
@Test
@RetryOnFailure(times = 1)
public void testCheckpointedStreamingSumProgram() throws Exception {
    // Config
    final int checkpointingInterval = 200;
    final int sequenceEnd = 5000;
    // Widen to long BEFORE multiplying: the product would otherwise be evaluated in int
    // arithmetic and could silently overflow for larger Parallelism/sequenceEnd values.
    final long expectedSum = (long) Parallelism * sequenceEnd * (sequenceEnd + 1) / 2;
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(Parallelism);
    env.enableCheckpointing(checkpointingInterval);
    env.addSource(new CheckpointedSequenceSource(sequenceEnd)).addSink(new CountingSink()).setParallelism(1);
    JobGraph jobGraph = env.getStreamGraph().getJobGraph();
    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getAbsoluteFile().toURI().toString());
    config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, Parallelism);
    // Everything below is declared outside the try block so that the catch/finally
    // clauses can print logs and release whatever was actually started.
    ActorSystem testSystem = null;
    final JobManagerProcess[] jobManagerProcess = new JobManagerProcess[2];
    LeaderRetrievalService leaderRetrievalService = null;
    ActorSystem taskManagerSystem = null;
    try {
        // A single deadline bounds all waiting steps of this test
        final Deadline deadline = TestTimeOut.fromNow();
        // Test actor system
        testSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));
        // The job managers
        jobManagerProcess[0] = new JobManagerProcess(0, config);
        jobManagerProcess[1] = new JobManagerProcess(1, config);
        jobManagerProcess[0].startProcess();
        jobManagerProcess[1].startProcess();
        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
        leaderRetrievalService.start(leaderListener);
        // The task manager
        taskManagerSystem = AkkaUtils.createActorSystem(config, Option.apply(new Tuple2<String, Object>("localhost", 0)));
        TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), taskManagerSystem, "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);
        {
            // Initial submission
            leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
            String leaderAddress = leaderListener.getAddress();
            UUID leaderId = leaderListener.getLeaderSessionID();
            // Get the leader ref
            ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
            ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
            // Submit the job in detached mode
            leader.tell(new SubmitJob(jobGraph, ListeningBehaviour.DETACHED));
            JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
        }
        // Who's the boss? Compare the Akka URL of process 0 with the announced leader address.
        JobManagerProcess leadingJobManagerProcess;
        if (jobManagerProcess[0].getJobManagerAkkaURL(deadline.timeLeft()).equals(leaderListener.getAddress())) {
            leadingJobManagerProcess = jobManagerProcess[0];
        } else {
            leadingJobManagerProcess = jobManagerProcess[1];
        }
        // Do not kill the leader before the checkpoint latch has been released
        CompletedCheckpointsLatch.await();
        // Kill the leading job manager process
        leadingJobManagerProcess.destroy();
        {
            // Recovery by the standby JobManager
            leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
            String leaderAddress = leaderListener.getAddress();
            UUID leaderId = leaderListener.getLeaderSessionID();
            ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
            ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
            JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
        }
        // Wait to finish
        FinalCountLatch.await();
        assertEquals(expectedSum, (long) FinalCount.get());
        // Every source subtask must have recorded a non-zero recovered state
        for (int i = 0; i < Parallelism; i++) {
            assertNotEquals(0, RecoveredStates.get(i));
        }
    } catch (Throwable t) {
        // Reset all static state for test retries
        CompletedCheckpointsLatch = new CountDownLatch(2);
        RecoveredStates = new AtomicLongArray(Parallelism);
        FinalCountLatch = new CountDownLatch(1);
        FinalCount = new AtomicReference<>();
        LastElement = -1;
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();
        // In case of an error, print the job manager process logs.
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].printProcessLog();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].printProcessLog();
        }
        throw t;
    } finally {
        // Release everything that was started, regardless of the test outcome
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].destroy();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].destroy();
        }
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }
        if (taskManagerSystem != null) {
            taskManagerSystem.shutdown();
        }
        if (testSystem != null) {
            testSystem.shutdown();
        }
    }
}
use of org.apache.flink.testutils.junit.RetryOnFailure in project flink by apache.
the class WikipediaEditsSourceTest method testWikipediaEditsSource.
/**
 * Runs a {@code WikipediaEditsSource} in a background thread and expects it to emit a
 * {@code WikipediaEditEvent} within the test timeout.
 *
 * <p>The connection to the IRC server is probed first. If it cannot be established, the test
 * body is skipped and only an info message is logged.
 */
@Test
@RetryOnFailure(times = 1)
public void testWikipediaEditsSource() throws Exception {
    // Guard clause: without connectivity there is nothing to verify
    if (!canConnect(1, TimeUnit.SECONDS)) {
        LOG.info("Skipping test, because not able to connect to IRC server.");
        return;
    }

    final Time testTimeout = Time.seconds(60);
    final WikipediaEditsSource source = new WikipediaEditsSource();
    ExecutorService sourceRunner = null;
    try {
        sourceRunner = Executors.newSingleThreadExecutor();
        final BlockingQueue<Object> eventQueue = new ArrayBlockingQueue<>(1);
        final AtomicReference<Exception> sourceFailure = new AtomicReference<>();

        // The source runs on its own thread so that the main test thread can never be
        // blocked indefinitely (e.g. when no event ever arrives); results and errors are
        // handed over via the queue and the atomic reference.
        sourceRunner.execute(() -> {
            try {
                source.run(new CollectingSourceContext<>(eventQueue));
            } catch (Exception e) {
                // Interruptions are not logged as failures
                if (!(e instanceof InterruptedException)) {
                    LOG.warn("Failure in WikipediaEditsSource", e);
                }
                sourceFailure.compareAndSet(null, e);
            }
        });

        final long deadline = deadlineNanos(testTimeout);
        Object received = null;
        Exception failure = null;
        // Poll until an event arrived, the source failed, or the deadline passed
        while (received == null && failure == null && System.nanoTime() < deadline) {
            received = eventQueue.poll(1, TimeUnit.SECONDS);
            failure = sourceFailure.get();
        }

        // fail(...) instead of assertNull(...) so that the error message is part of the report
        if (failure != null) {
            fail("Failure in WikipediaEditsSource: " + failure.getMessage());
        }
        assertNotNull("Did not receive a WikipediaEditEvent within the desired timeout", received);
        assertTrue("Received unexpected event " + received, received instanceof WikipediaEditEvent);
    } finally {
        // Stop the source first, then tear down the thread that runs it
        source.cancel();
        if (sourceRunner != null) {
            sourceRunner.shutdownNow();
            sourceRunner.awaitTermination(1, TimeUnit.SECONDS);
        }
    }
}
use of org.apache.flink.testutils.junit.RetryOnFailure in project flink by apache.
the class RetryExtension method supportsTestTemplate.
/**
 * Reports whether this extension applies to the given context.
 *
 * @param context the extension context to inspect
 * @return {@code true} if {@code getRetryAnnotation} yields a {@code RetryOnFailure} or a
 *     {@code RetryOnException} annotation for the context, {@code false} otherwise
 */
@Override
public boolean supportsTestTemplate(ExtensionContext context) {
    final boolean hasRetryOnFailure = getRetryAnnotation(context, RetryOnFailure.class) != null;
    final boolean hasRetryOnException = getRetryAnnotation(context, RetryOnException.class) != null;
    return hasRetryOnException || hasRetryOnFailure;
}
use of org.apache.flink.testutils.junit.RetryOnFailure in project flink by apache.
the class JobManagerHACheckpointRecoveryITCase method testCheckpointRecoveryFailure.
/**
 * Tests that the JobManager logs failures during recovery properly.
 *
 * <p>A blocking job is submitted to the leading JobManager, the file state backend directory
 * is deleted and the leader is destroyed. The output of the remaining JobManager process is
 * then polled until it contains the expected failure messages or the test deadline expires.
 *
 * @see <a href="https://issues.apache.org/jira/browse/FLINK-3185">FLINK-3185</a>
 */
@Test
@RetryOnFailure(times = 1)
public void testCheckpointRecoveryFailure() throws Exception {
    final Deadline testDeadline = TestTimeOut.fromNow();
    final Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(
            ZooKeeper.getConnectString(),
            FileStateBackendBasePath.getAbsoluteFile().toString());
    config.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, 2);

    // Declared up front so that catch/finally can reach whatever has been started
    final JobManagerProcess[] jobManagers = new JobManagerProcess[2];
    LeaderRetrievalService leaderRetrieval = null;
    ActorSystem taskManagerSystem = null;
    ActorSystem testActorSystem = null;
    try {
        // Actor system used by the test itself
        testActorSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));

        // Create both job manager processes first, then start them
        for (int id = 0; id < jobManagers.length; id++) {
            jobManagers[id] = new JobManagerProcess(id, config);
        }
        for (JobManagerProcess jobManager : jobManagers) {
            jobManager.startProcess();
        }

        // Listen for leader announcements
        final TestingListener leaderListener = new TestingListener();
        leaderRetrieval = ZooKeeperUtils.createLeaderRetrievalService(config);
        leaderRetrieval.start(leaderListener);

        // Task manager
        taskManagerSystem = AkkaUtils.createActorSystem(config, Option.apply(new Tuple2<String, Object>("localhost", 0)));
        TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), taskManagerSystem, "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);

        // Resolve a gateway to the current leader
        leaderListener.waitForNewLeader(testDeadline.timeLeft().toMillis());
        final String leaderAddress = leaderListener.getAddress();
        final UUID leaderSessionId = leaderListener.getLeaderSessionID();
        final ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testActorSystem, testDeadline.timeLeft());
        final ActorGateway leaderGateway = new AkkaActorGateway(leaderRef, leaderSessionId);

        // Map the announced leader address back to one of the two processes
        final boolean firstIsLeader =
                jobManagers[0].getJobManagerAkkaURL(testDeadline.timeLeft()).equals(leaderListener.getAddress());
        final JobManagerProcess leaderProcess = firstIsLeader ? jobManagers[0] : jobManagers[1];
        final JobManagerProcess standbyProcess = firstIsLeader ? jobManagers[1] : jobManagers[0];

        // A job consisting of a single blocking vertex
        final JobVertex blockingVertex = new JobVertex("Blocking vertex");
        blockingVertex.setInvokableClass(BlockingNoOpInvokable.class);
        final JobGraph blockingJob = new JobGraph(blockingVertex);

        // Detached submission, then wait until the job is running
        leaderGateway.tell(new SubmitJob(blockingJob, ListeningBehaviour.DETACHED));
        JobManagerActorTestUtils.waitForJobStatus(blockingJob.getJobID(), JobStatus.RUNNING, leaderGateway, testDeadline.timeLeft());

        // Remove all files, then kill the leader
        FileUtils.deleteDirectory(FileStateBackendBasePath);
        leaderProcess.destroy();

        // Verify that the job manager logs the failed recovery. We can not
        // do more at this point. :(
        boolean foundExpectedOutput = false;
        while (testDeadline.hasTimeLeft()) {
            final String standbyOutput = standbyProcess.getProcessOutput();
            if (standbyOutput == null) {
                log.warn("No process output available.");
            } else if (standbyOutput.contains("Failed to recover job")
                    && standbyOutput.contains("java.io.FileNotFoundException")) {
                foundExpectedOutput = true;
                break;
            }
            Thread.sleep(500);
        }
        assertTrue("Did not find expected output in logs.", foundExpectedOutput);
    } catch (Throwable t) {
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();
        // In case of an error, print the job manager process logs.
        for (JobManagerProcess jobManager : jobManagers) {
            if (jobManager != null) {
                jobManager.printProcessLog();
            }
        }
        throw t;
    } finally {
        // Tear down in the order: job managers, leader retrieval, actor systems
        for (JobManagerProcess jobManager : jobManagers) {
            if (jobManager != null) {
                jobManager.destroy();
            }
        }
        if (leaderRetrieval != null) {
            leaderRetrieval.stop();
        }
        if (taskManagerSystem != null) {
            taskManagerSystem.shutdown();
        }
        if (testActorSystem != null) {
            testActorSystem.shutdown();
        }
    }
}
use of org.apache.flink.testutils.junit.RetryOnFailure in project flink by apache.
the class RetryExtension method provideTestTemplateInvocationContexts.
/**
 * Produces one invocation context per attempt of a retried test.
 *
 * <p>The number of attempts is {@code times() + 1} of whichever retry annotation is present.
 * The matching {@code RetryStrategy} is registered in the extension store (namespace
 * {@code RETRY_NAMESPACE}, key {@code RETRY_KEY}) under the key returned by
 * {@code getTestMethodKey(context)}.
 *
 * @param context the extension context of the test template
 * @return a stream of {@code RetryContext} instances numbered from 1 to the total attempts
 * @throws IllegalArgumentException if both retry annotations are present, or if neither is
 */
@Override
public Stream<TestTemplateInvocationContext> provideTestTemplateInvocationContexts(ExtensionContext context) {
    final RetryOnFailure failureRetry = getRetryAnnotation(context, RetryOnFailure.class);
    final RetryOnException exceptionRetry = getRetryAnnotation(context, RetryOnException.class);

    // sanity check that we don't use both annotations
    if (failureRetry != null && exceptionRetry != null) {
        throw new IllegalArgumentException("You cannot combine the RetryOnFailure and RetryOnException annotations.");
    }

    // Per-method strategies live in a map inside the extension store; created on first use
    final Map<String, RetryStrategy> strategies =
            (Map<String, RetryStrategy>) context.getStore(RETRY_NAMESPACE)
                    .getOrComputeIfAbsent(RETRY_KEY, key -> new HashMap<>());

    if (exceptionRetry == null && failureRetry == null) {
        throw new IllegalArgumentException("Unsupported retry strategy.");
    }

    final String methodKey = getTestMethodKey(context);
    final int attempts;
    final RetryStrategy strategy;
    if (exceptionRetry != null) {
        attempts = exceptionRetry.times() + 1;
        strategy = new RetryOnExceptionStrategy(attempts, exceptionRetry.exception());
    } else {
        attempts = failureRetry.times() + 1;
        strategy = new RetryOnFailureStrategy(attempts);
    }
    strategies.put(methodKey, strategy);

    return IntStream.rangeClosed(1, attempts).mapToObj(attempt -> new RetryContext(attempt, attempts));
}
Aggregations