
Example 1 with RetryOnFailure

use of org.apache.flink.testutils.junit.RetryOnFailure in project flink by apache.

the class JobManagerHACheckpointRecoveryITCase method testCheckpointedStreamingSumProgram.

/**
 * Simple checkpointed streaming sum.
 *
 * <p>The sources (Parallelism) count until sequenceEnd. The sink (1) sums up all counts and
 * returns it to the main thread via a static variable. We wait until some checkpoints are
 * completed and sanity check that the sources recover with an updated state to make sure that
 * this test actually tests something.
 */
@Test
@RetryOnFailure(times = 1)
public void testCheckpointedStreamingSumProgram() throws Exception {
    // Config
    final int checkpointingInterval = 200;
    final int sequenceEnd = 5000;
    final long expectedSum = Parallelism * sequenceEnd * (sequenceEnd + 1) / 2;
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(Parallelism);
    env.enableCheckpointing(checkpointingInterval);
    env.addSource(new CheckpointedSequenceSource(sequenceEnd)).addSink(new CountingSink()).setParallelism(1);
    JobGraph jobGraph = env.getStreamGraph().getJobGraph();
    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getAbsoluteFile().toURI().toString());
    config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, Parallelism);
    ActorSystem testSystem = null;
    final JobManagerProcess[] jobManagerProcess = new JobManagerProcess[2];
    LeaderRetrievalService leaderRetrievalService = null;
    ActorSystem taskManagerSystem = null;
    try {
        final Deadline deadline = TestTimeOut.fromNow();
        // Test actor system
        testSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));
        // The job managers
        jobManagerProcess[0] = new JobManagerProcess(0, config);
        jobManagerProcess[1] = new JobManagerProcess(1, config);
        jobManagerProcess[0].startProcess();
        jobManagerProcess[1].startProcess();
        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
        leaderRetrievalService.start(leaderListener);
        // The task manager
        taskManagerSystem = AkkaUtils.createActorSystem(config, Option.apply(new Tuple2<String, Object>("localhost", 0)));
        TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), taskManagerSystem, "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);
        {
            // Initial submission
            leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
            String leaderAddress = leaderListener.getAddress();
            UUID leaderId = leaderListener.getLeaderSessionID();
            // Get the leader ref
            ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
            ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
            // Submit the job in detached mode
            leader.tell(new SubmitJob(jobGraph, ListeningBehaviour.DETACHED));
            JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
        }
        // Who's the boss?
        JobManagerProcess leadingJobManagerProcess;
        if (jobManagerProcess[0].getJobManagerAkkaURL(deadline.timeLeft()).equals(leaderListener.getAddress())) {
            leadingJobManagerProcess = jobManagerProcess[0];
        } else {
            leadingJobManagerProcess = jobManagerProcess[1];
        }
        CompletedCheckpointsLatch.await();
        // Kill the leading job manager process
        leadingJobManagerProcess.destroy();
        {
            // Recovery by the standby JobManager
            leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
            String leaderAddress = leaderListener.getAddress();
            UUID leaderId = leaderListener.getLeaderSessionID();
            ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
            ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
            JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
        }
        // Wait to finish
        FinalCountLatch.await();
        assertEquals(expectedSum, (long) FinalCount.get());
        for (int i = 0; i < Parallelism; i++) {
            assertNotEquals(0, RecoveredStates.get(i));
        }
    } catch (Throwable t) {
        // Reset all static state for test retries
        CompletedCheckpointsLatch = new CountDownLatch(2);
        RecoveredStates = new AtomicLongArray(Parallelism);
        FinalCountLatch = new CountDownLatch(1);
        FinalCount = new AtomicReference<>();
        LastElement = -1;
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();
        // In case of an error, print the job manager process logs.
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].printProcessLog();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].printProcessLog();
        }
        throw t;
    } finally {
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].destroy();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].destroy();
        }
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }
        if (taskManagerSystem != null) {
            taskManagerSystem.shutdown();
        }
        if (testSystem != null) {
            testSystem.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) TestingListener(org.apache.flink.runtime.leaderelection.TestingListener) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) UUID(java.util.UUID) SubmitJob(org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob) Deadline(scala.concurrent.duration.Deadline) AtomicReference(java.util.concurrent.atomic.AtomicReference) CountDownLatch(java.util.concurrent.CountDownLatch) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) Some(scala.Some) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) JobManagerProcess(org.apache.flink.runtime.testutils.JobManagerProcess) AtomicLongArray(java.util.concurrent.atomic.AtomicLongArray) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) Test(org.junit.Test) RetryOnFailure(org.apache.flink.testutils.junit.RetryOnFailure)
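On its own, @RetryOnFailure does not re-run anything under JUnit 4; a rule from the same testutils package has to read the annotation. A minimal sketch of how a flaky test might be wired up (the test class, the flaky check, and the rule registration shown here are illustrative assumptions, not taken from the example above):

import org.apache.flink.testutils.junit.RetryOnFailure;
import org.apache.flink.testutils.junit.RetryRule;
import org.junit.Rule;
import org.junit.Test;

import static org.junit.Assert.assertTrue;

public class FlakyOperationTest {

    // Without a registered RetryRule the annotation has no effect under JUnit 4.
    @Rule
    public final RetryRule retryRule = new RetryRule();

    @Test
    @RetryOnFailure(times = 2)
    public void testSometimesFlaky() {
        // Hypothetical flaky check: on failure the method is re-run up to 2 more times.
        assertTrue("operation did not succeed", Math.random() > 0.1);
    }
}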

Example 2 with RetryOnFailure

use of org.apache.flink.testutils.junit.RetryOnFailure in project flink by apache.

the class WikipediaEditsSourceTest method testWikipediaEditsSource.

/**
 * We first check the connection to the IRC server. If it fails, this test is ignored.
 */
@Test
@RetryOnFailure(times = 1)
public void testWikipediaEditsSource() throws Exception {
    if (canConnect(1, TimeUnit.SECONDS)) {
        final Time testTimeout = Time.seconds(60);
        final WikipediaEditsSource wikipediaEditsSource = new WikipediaEditsSource();
        ExecutorService executorService = null;
        try {
            executorService = Executors.newSingleThreadExecutor();
            BlockingQueue<Object> collectedEvents = new ArrayBlockingQueue<>(1);
            AtomicReference<Exception> asyncError = new AtomicReference<>();
            // Execute the source in a different thread and collect events into the queue.
            // We do this in a separate thread in order to not block the main test thread
            // indefinitely in case that something bad happens (like not receiving any
            // events)
            executorService.execute(() -> {
                try {
                    wikipediaEditsSource.run(new CollectingSourceContext<>(collectedEvents));
                } catch (Exception e) {
                    boolean interrupted = e instanceof InterruptedException;
                    if (!interrupted) {
                        LOG.warn("Failure in WikipediaEditsSource", e);
                    }
                    asyncError.compareAndSet(null, e);
                }
            });
            long deadline = deadlineNanos(testTimeout);
            Object event = null;
            Exception error = null;
            // Check event or error
            while (event == null && error == null && System.nanoTime() < deadline) {
                event = collectedEvents.poll(1, TimeUnit.SECONDS);
                error = asyncError.get();
            }
            if (error != null) {
                // We don't use assertNull, because we want to include the error message
                fail("Failure in WikipediaEditsSource: " + error.getMessage());
            }
            assertNotNull("Did not receive a WikipediaEditEvent within the desired timeout", event);
            assertTrue("Received unexpected event " + event, event instanceof WikipediaEditEvent);
        } finally {
            wikipediaEditsSource.cancel();
            if (executorService != null) {
                executorService.shutdownNow();
                executorService.awaitTermination(1, TimeUnit.SECONDS);
            }
        }
    } else {
        LOG.info("Skipping test, because not able to connect to IRC server.");
    }
}
Also used : Time(org.apache.flink.api.common.time.Time) AtomicReference(java.util.concurrent.atomic.AtomicReference) ArrayBlockingQueue(java.util.concurrent.ArrayBlockingQueue) ExecutorService(java.util.concurrent.ExecutorService) Test(org.junit.Test) RetryOnFailure(org.apache.flink.testutils.junit.RetryOnFailure)
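The structure of this test — running the blocking source on a separate thread, collecting into a bounded queue, and polling with a deadline while watching for an asynchronous error — is a reusable idiom. A minimal, generic sketch under the assumption of a plain Runnable producer (all names here are illustrative, not Flink APIs):

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicReference;

public final class PollWithDeadline {

    /** Runs the producer on its own thread and waits for one element or an asynchronous error. */
    static Object awaitFirstElement(
            Runnable producer,
            BlockingQueue<Object> events,
            AtomicReference<Exception> asyncError,
            long timeoutSeconds) throws InterruptedException {

        ExecutorService executor = Executors.newSingleThreadExecutor();
        try {
            executor.execute(producer);
            long deadline = System.nanoTime() + TimeUnit.SECONDS.toNanos(timeoutSeconds);
            Object event = null;
            // Poll in one-second slices so an error reported by the producer is noticed quickly.
            while (event == null && asyncError.get() == null && System.nanoTime() < deadline) {
                event = events.poll(1, TimeUnit.SECONDS);
            }
            return event;
        } finally {
            executor.shutdownNow();
            executor.awaitTermination(1, TimeUnit.SECONDS);
        }
    }
}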

Example 3 with RetryOnFailure

use of org.apache.flink.testutils.junit.RetryOnFailure in project flink by apache.

the class RetryExtension method supportsTestTemplate.

@Override
public boolean supportsTestTemplate(ExtensionContext context) {
    RetryOnFailure retryOnFailure = getRetryAnnotation(context, RetryOnFailure.class);
    RetryOnException retryOnException = getRetryAnnotation(context, RetryOnException.class);
    return retryOnException != null || retryOnFailure != null;
}
Also used : RetryOnException(org.apache.flink.testutils.junit.RetryOnException) RetryOnFailure(org.apache.flink.testutils.junit.RetryOnFailure)
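supportsTestTemplate only answers whether the extension applies; under JUnit 5 the retried method must itself be declared as a test template. A hedged usage sketch (the test class and assertion are hypothetical; it assumes class-level registration of the extension via @ExtendWith):

import org.apache.flink.testutils.junit.RetryOnFailure;
import org.apache.flink.testutils.junit.extensions.retry.RetryExtension;
import org.junit.jupiter.api.TestTemplate;
import org.junit.jupiter.api.extension.ExtendWith;

import static org.junit.jupiter.api.Assertions.assertTrue;

@ExtendWith(RetryExtension.class)
class RetriedJUnit5Test {

    // supportsTestTemplate() answers true for this method because it carries a
    // retry annotation, so the extension supplies the repeated invocations.
    @TestTemplate
    @RetryOnFailure(times = 3)
    void flakyCase() {
        // Hypothetical intermittent check.
        assertTrue(System.nanoTime() % 10 != 0);
    }
}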

Example 4 with RetryOnFailure

use of org.apache.flink.testutils.junit.RetryOnFailure in project flink by apache.

the class JobManagerHACheckpointRecoveryITCase method testCheckpointRecoveryFailure.

/**
 * Tests that the JobManager logs failures during recovery properly.
 *
 * @see <a href="https://issues.apache.org/jira/browse/FLINK-3185">FLINK-3185</a>
 */
@Test
@RetryOnFailure(times = 1)
public void testCheckpointRecoveryFailure() throws Exception {
    final Deadline testDeadline = TestTimeOut.fromNow();
    final String zooKeeperQuorum = ZooKeeper.getConnectString();
    final String fileStateBackendPath = FileStateBackendBasePath.getAbsoluteFile().toString();
    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(zooKeeperQuorum, fileStateBackendPath);
    config.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, 2);
    JobManagerProcess[] jobManagerProcess = new JobManagerProcess[2];
    LeaderRetrievalService leaderRetrievalService = null;
    ActorSystem taskManagerSystem = null;
    ActorSystem testActorSystem = null;
    try {
        // Test actor system
        testActorSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));
        // The job managers
        jobManagerProcess[0] = new JobManagerProcess(0, config);
        jobManagerProcess[1] = new JobManagerProcess(1, config);
        jobManagerProcess[0].startProcess();
        jobManagerProcess[1].startProcess();
        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
        leaderRetrievalService.start(leaderListener);
        // The task manager
        taskManagerSystem = AkkaUtils.createActorSystem(config, Option.apply(new Tuple2<String, Object>("localhost", 0)));
        TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), taskManagerSystem, "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);
        // Get the leader
        leaderListener.waitForNewLeader(testDeadline.timeLeft().toMillis());
        String leaderAddress = leaderListener.getAddress();
        UUID leaderId = leaderListener.getLeaderSessionID();
        // Get the leader ref
        ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testActorSystem, testDeadline.timeLeft());
        ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
        // Who's the boss?
        JobManagerProcess leadingJobManagerProcess;
        JobManagerProcess nonLeadingJobManagerProcess;
        if (jobManagerProcess[0].getJobManagerAkkaURL(testDeadline.timeLeft()).equals(leaderListener.getAddress())) {
            leadingJobManagerProcess = jobManagerProcess[0];
            nonLeadingJobManagerProcess = jobManagerProcess[1];
        } else {
            leadingJobManagerProcess = jobManagerProcess[1];
            nonLeadingJobManagerProcess = jobManagerProcess[0];
        }
        // Blocking JobGraph
        JobVertex blockingVertex = new JobVertex("Blocking vertex");
        blockingVertex.setInvokableClass(BlockingNoOpInvokable.class);
        JobGraph jobGraph = new JobGraph(blockingVertex);
        // Submit the job in detached mode
        leader.tell(new SubmitJob(jobGraph, ListeningBehaviour.DETACHED));
        // Wait for the job to be running
        JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, testDeadline.timeLeft());
        // Remove all files
        FileUtils.deleteDirectory(FileStateBackendBasePath);
        // Kill the leader
        leadingJobManagerProcess.destroy();
        // Verify that the job manager logs the failed recovery. We can not
        // do more at this point. :(
        boolean success = false;
        while (testDeadline.hasTimeLeft()) {
            String output = nonLeadingJobManagerProcess.getProcessOutput();
            if (output != null) {
                if (output.contains("Failed to recover job") && output.contains("java.io.FileNotFoundException")) {
                    success = true;
                    break;
                }
            } else {
                log.warn("No process output available.");
            }
            Thread.sleep(500);
        }
        assertTrue("Did not find expected output in logs.", success);
    } catch (Throwable t) {
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();
        // In case of an error, print the job manager process logs.
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].printProcessLog();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].printProcessLog();
        }
        throw t;
    } finally {
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].destroy();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].destroy();
        }
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }
        if (taskManagerSystem != null) {
            taskManagerSystem.shutdown();
        }
        if (testActorSystem != null) {
            testActorSystem.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) Deadline(scala.concurrent.duration.Deadline) TestingListener(org.apache.flink.runtime.leaderelection.TestingListener) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) Some(scala.Some) JobVertex(org.apache.flink.runtime.jobgraph.JobVertex) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) JobManagerProcess(org.apache.flink.runtime.testutils.JobManagerProcess) UUID(java.util.UUID) SubmitJob(org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob) Test(org.junit.Test) RetryOnFailure(org.apache.flink.testutils.junit.RetryOnFailure)
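The closing loop of this test is a generic "poll the process log until a marker shows up or the deadline runs out" pattern. A standalone sketch with a hypothetical output supplier, kept independent of JobManagerProcess:

import java.util.function.Supplier;

public final class LogPolling {

    /** Polls the supplied output until all markers appear or the deadline passes. */
    static boolean waitForLogMarkers(Supplier<String> output, long deadlineMillis, String... markers)
            throws InterruptedException {
        while (System.currentTimeMillis() < deadlineMillis) {
            String log = output.get();
            if (log != null && containsAll(log, markers)) {
                return true;
            }
            Thread.sleep(500);
        }
        return false;
    }

    private static boolean containsAll(String log, String[] markers) {
        for (String marker : markers) {
            if (!log.contains(marker)) {
                return false;
            }
        }
        return true;
    }
}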

Example 5 with RetryOnFailure

use of org.apache.flink.testutils.junit.RetryOnFailure in project flink by apache.

the class RetryExtension method provideTestTemplateInvocationContexts.

@Override
public Stream<TestTemplateInvocationContext> provideTestTemplateInvocationContexts(ExtensionContext context) {
    RetryOnFailure retryOnFailure = getRetryAnnotation(context, RetryOnFailure.class);
    RetryOnException retryOnException = getRetryAnnotation(context, RetryOnException.class);
    // sanity check that we don't use both annotations
    if (retryOnFailure != null && retryOnException != null) {
        throw new IllegalArgumentException("You cannot combine the RetryOnFailure and RetryOnException annotations.");
    }
    Map<String, RetryStrategy> testLog = (Map<String, RetryStrategy>) context.getStore(RETRY_NAMESPACE).getOrComputeIfAbsent(RETRY_KEY, key -> new HashMap<>());
    int totalTimes;
    if (retryOnException != null) {
        totalTimes = retryOnException.times() + 1;
        testLog.put(getTestMethodKey(context), new RetryOnExceptionStrategy(totalTimes, retryOnException.exception()));
    } else if (retryOnFailure != null) {
        totalTimes = retryOnFailure.times() + 1;
        testLog.put(getTestMethodKey(context), new RetryOnFailureStrategy(totalTimes));
    } else {
        throw new IllegalArgumentException("Unsupported retry strategy.");
    }
    return IntStream.rangeClosed(1, totalTimes).mapToObj(i -> new RetryContext(i, totalTimes));
}
Also used : RetryOnException(org.apache.flink.testutils.junit.RetryOnException) IntStream(java.util.stream.IntStream) RetryOnExceptionStrategy(org.apache.flink.testutils.junit.extensions.retry.strategy.RetryOnExceptionStrategy) Arrays(java.util.Arrays) RetryStrategy(org.apache.flink.testutils.junit.extensions.retry.strategy.RetryStrategy) HashMap(java.util.HashMap) ExtensionContext(org.junit.jupiter.api.extension.ExtensionContext) Extension(org.junit.jupiter.api.extension.Extension) List(java.util.List) Stream(java.util.stream.Stream) AfterAllCallback(org.junit.jupiter.api.extension.AfterAllCallback) TestTemplateInvocationContext(org.junit.jupiter.api.extension.TestTemplateInvocationContext) Map(java.util.Map) Annotation(java.lang.annotation.Annotation) TestTemplateInvocationContextProvider(org.junit.jupiter.api.extension.TestTemplateInvocationContextProvider) RetryOnFailureStrategy(org.apache.flink.testutils.junit.extensions.retry.strategy.RetryOnFailureStrategy) RetryOnFailure(org.apache.flink.testutils.junit.RetryOnFailure) Method(java.lang.reflect.Method)
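One detail worth spelling out: times() counts retries, not total attempts, so @RetryOnFailure(times = 2) produces three invocation contexts. A small, self-contained illustration of the same rangeClosed arithmetic (plain Java, no JUnit types involved):

import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.IntStream;

public final class RetryCountDemo {

    public static void main(String[] args) {
        int retryTimes = 2;                 // value of @RetryOnFailure(times = 2)
        int totalTimes = retryTimes + 1;    // first attempt plus the retries

        // Mirrors IntStream.rangeClosed(1, totalTimes) in the extension:
        List<String> attempts = IntStream.rangeClosed(1, totalTimes)
                .mapToObj(i -> "attempt " + i + " of " + totalTimes)
                .collect(Collectors.toList());

        attempts.forEach(System.out::println);  // attempt 1 of 3 ... attempt 3 of 3
    }
}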

Aggregations

RetryOnFailure (org.apache.flink.testutils.junit.RetryOnFailure): 5
Test (org.junit.Test): 3
ActorRef (akka.actor.ActorRef): 2
ActorSystem (akka.actor.ActorSystem): 2
UUID (java.util.UUID): 2
AtomicReference (java.util.concurrent.atomic.AtomicReference): 2
Configuration (org.apache.flink.configuration.Configuration): 2
ActorGateway (org.apache.flink.runtime.instance.ActorGateway): 2
AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway): 2
JobGraph (org.apache.flink.runtime.jobgraph.JobGraph): 2
TestingListener (org.apache.flink.runtime.leaderelection.TestingListener): 2
LeaderRetrievalService (org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService): 2
SubmitJob (org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob): 2
JobManagerProcess (org.apache.flink.runtime.testutils.JobManagerProcess): 2
RetryOnException (org.apache.flink.testutils.junit.RetryOnException): 2
Some (scala.Some): 2
Deadline (scala.concurrent.duration.Deadline): 2
Annotation (java.lang.annotation.Annotation): 1
Method (java.lang.reflect.Method): 1
Arrays (java.util.Arrays): 1