Search in sources:

Example 6 with Deadline

use of scala.concurrent.duration.Deadline in project flink by apache.

the class ZooKeeperLeaderElectionTest method testZooKeeperReelection.

/**
 * Tests repeated re-election among the LeaderContenders that are still available. Each time a
 * contender has been elected as the leader, its election service is stopped. This forces the
 * ZooKeeperLeaderElectionService to elect a new leader from the remaining contenders.
 *
 * <p>The test passes when every one of the contenders has been observed as leader exactly
 * once before the deadline expires.
 */
@Test
public void testZooKeeperReelection() throws Exception {
    Configuration configuration = new Configuration();
    configuration.setString(HighAvailabilityOptions.HA_ZOOKEEPER_QUORUM, testingServer.getConnectString());
    configuration.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");
    // Upper bound for the whole election/re-election sequence
    Deadline deadline = new FiniteDuration(5, TimeUnit.MINUTES).fromNow();
    int num = 20;
    ZooKeeperLeaderElectionService[] leaderElectionService = new ZooKeeperLeaderElectionService[num];
    TestingContender[] contenders = new TestingContender[num];
    ZooKeeperLeaderRetrievalService leaderRetrievalService = null;
    TestingListener listener = new TestingListener();
    try {
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(configuration);
        LOG.debug("Start leader retrieval service for the TestingListener.");
        leaderRetrievalService.start(listener);
        for (int i = 0; i < num; i++) {
            leaderElectionService[i] = ZooKeeperUtils.createLeaderElectionService(configuration);
            // The contender's address encodes its index so it can be recovered from the
            // leader address reported by the listener.
            contenders[i] = new TestingContender(TEST_URL + "_" + i, leaderElectionService[i]);
            LOG.debug("Start leader election service for contender #{}.", i);
            leaderElectionService[i].start(contenders[i]);
        }
        String pattern = TEST_URL + "_" + "(\\d+)";
        Pattern regex = Pattern.compile(pattern);
        int numberSeenLeaders = 0;
        while (deadline.hasTimeLeft() && numberSeenLeaders < num) {
            LOG.debug("Wait for new leader #{}.", numberSeenLeaders);
            String address = listener.waitForNewLeader(deadline.timeLeft().toMillis());
            Matcher m = regex.matcher(address);
            if (m.find()) {
                int index = Integer.parseInt(m.group(1));
                TestingContender contender = contenders[index];
                // check that the retrieval service has retrieved the correct leader;
                // if address or session id do not match, wait for the next notification
                if (address.equals(contender.getAddress()) && listener.getLeaderSessionID().equals(contender.getLeaderSessionID())) {
                    // kill the election service of the leader; log the index of the contender
                    // that is actually stopped, not the running count of leaders seen so far
                    LOG.debug("Stop leader election service of contender #{}.", index);
                    leaderElectionService[index].stop();
                    // mark as stopped so that the cleanup below does not stop it a second time
                    leaderElectionService[index] = null;
                    numberSeenLeaders++;
                }
            } else {
                fail("Did not find the leader's index.");
            }
        }
        assertFalse(deadline.isOverdue());
        assertEquals(num, numberSeenLeaders);
    } finally {
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }
        // stop every election service that is still running (entries are null once stopped)
        for (ZooKeeperLeaderElectionService electionService : leaderElectionService) {
            if (electionService != null) {
                electionService.stop();
            }
        }
    }
}
Also used : Pattern(java.util.regex.Pattern) Configuration(org.apache.flink.configuration.Configuration) Matcher(java.util.regex.Matcher) Deadline(scala.concurrent.duration.Deadline) FiniteDuration(scala.concurrent.duration.FiniteDuration) ZooKeeperLeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.ZooKeeperLeaderRetrievalService) Test(org.junit.Test)

Example 7 with Deadline

use of scala.concurrent.duration.Deadline in project flink by apache.

the class OneInputStreamTaskTest method testSnapshottingAndRestoring.

/**
 * Tests that the stream operator can snapshot and restore the operator state of chained
 * operators.
 *
 * <p>A first task is run with a chain of testing operators and checkpointed once. A second
 * task is then started from the state handles captured by that checkpoint, and the test
 * verifies that restore was called once per chained operator.
 */
@Test
public void testSnapshottingAndRestoring() throws Exception {
    final Deadline deadline = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();
    final OneInputStreamTask<String, String> streamTask = new OneInputStreamTask<String, String>();
    final OneInputStreamTaskTestHarness<String, String> testHarness = new OneInputStreamTaskTestHarness<String, String>(streamTask, BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO);
    testHarness.setupOutputForSingletonOperatorChain();
    IdentityKeySelector<String> keySelector = new IdentityKeySelector<>();
    testHarness.configureForKeyedStream(keySelector, BasicTypeInfo.STRING_TYPE_INFO);
    long checkpointId = 1L;
    long checkpointTimestamp = 1L;
    long recoveryTimestamp = 3L;
    long seed = 2L;
    int numberChainedTasks = 11;
    StreamConfig streamConfig = testHarness.getStreamConfig();
    configureChainedTestingStreamOperator(streamConfig, numberChainedTasks, seed, recoveryTimestamp);
    AcknowledgeStreamMockEnvironment env = new AcknowledgeStreamMockEnvironment(testHarness.jobConfig, testHarness.taskConfig, testHarness.executionConfig, testHarness.memorySize, new MockInputSplitProvider(), testHarness.bufferSize);
    // reset number of restore calls
    TestingStreamOperator.numberRestoreCalls = 0;
    testHarness.invoke(env);
    testHarness.waitForTaskRunning(deadline.timeLeft().toMillis());
    CheckpointMetaData checkpointMetaData = new CheckpointMetaData(checkpointId, checkpointTimestamp);
    // Retry until the task accepts the checkpoint trigger, but never beyond the test
    // deadline: a task that never accepts the trigger must fail the test instead of
    // spinning forever.
    while (!streamTask.triggerCheckpoint(checkpointMetaData, CheckpointOptions.forFullCheckpoint())) {
        if (deadline.isOverdue()) {
            throw new AssertionError("Checkpoint could not be triggered before the test deadline expired.");
        }
    }
    // since no state was set, there shouldn't be restore calls
    assertEquals(0, TestingStreamOperator.numberRestoreCalls);
    // wait until the environment has seen the checkpoint
    env.getCheckpointLatch().await();
    assertEquals(checkpointId, env.getCheckpointId());
    testHarness.endInput();
    testHarness.waitForTaskCompletion(deadline.timeLeft().toMillis());
    // second run: a fresh task initialized with the state handles of the checkpoint above
    final OneInputStreamTask<String, String> restoredTask = new OneInputStreamTask<String, String>();
    restoredTask.setInitialState(new TaskStateHandles(env.getCheckpointStateHandles()));
    final OneInputStreamTaskTestHarness<String, String> restoredTaskHarness = new OneInputStreamTaskTestHarness<String, String>(restoredTask, BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO);
    restoredTaskHarness.configureForKeyedStream(keySelector, BasicTypeInfo.STRING_TYPE_INFO);
    StreamConfig restoredTaskStreamConfig = restoredTaskHarness.getStreamConfig();
    configureChainedTestingStreamOperator(restoredTaskStreamConfig, numberChainedTasks, seed, recoveryTimestamp);
    // reset the counter so that only restore calls of the second run are counted
    TestingStreamOperator.numberRestoreCalls = 0;
    restoredTaskHarness.invoke();
    restoredTaskHarness.endInput();
    restoredTaskHarness.waitForTaskCompletion(deadline.timeLeft().toMillis());
    // restore of every chained operator should have been called
    assertEquals(numberChainedTasks, TestingStreamOperator.numberRestoreCalls);
    TestingStreamOperator.numberRestoreCalls = 0;
}
Also used : Deadline(scala.concurrent.duration.Deadline) FiniteDuration(scala.concurrent.duration.FiniteDuration) StreamConfig(org.apache.flink.streaming.api.graph.StreamConfig) CheckpointMetaData(org.apache.flink.runtime.checkpoint.CheckpointMetaData) TaskStateHandles(org.apache.flink.runtime.state.TaskStateHandles) MockInputSplitProvider(org.apache.flink.runtime.operators.testutils.MockInputSplitProvider) Test(org.junit.Test)

Example 8 with Deadline

use of scala.concurrent.duration.Deadline in project flink by apache.

the class JobManagerHACheckpointRecoveryITCase method testCheckpointedStreamingSumProgram.

/**
 * Simple checkpointed streaming sum.
 *
 * <p>The sources (Parallelism) count until sequenceEnd. The sink (1) sums up all counts and
 * returns it to the main thread via a static variable. We wait until some checkpoints are
 * completed and sanity check that the sources recover with an updated state to make sure that
 * this test actually tests something.
 *
 * <p>The test starts two JobManager processes, submits the job to the leader, kills the
 * leading process once checkpoints have completed and then expects the job to reach RUNNING
 * again on the newly elected leader and to produce the expected sum.
 */
@Test
@RetryOnFailure(times = 1)
public void testCheckpointedStreamingSumProgram() throws Exception {
    // Config
    final int checkpointingInterval = 200;
    final int sequenceEnd = 5000;
    // Each source emits 1..sequenceEnd, so the total is Parallelism * n * (n + 1) / 2.
    // Widen to long before multiplying so that larger parallelism or sequence lengths
    // cannot silently overflow int arithmetic.
    final long expectedSum = (long) Parallelism * sequenceEnd * (sequenceEnd + 1) / 2;
    final StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(Parallelism);
    env.enableCheckpointing(checkpointingInterval);
    env.addSource(new CheckpointedSequenceSource(sequenceEnd)).addSink(new CountingSink()).setParallelism(1);
    JobGraph jobGraph = env.getStreamGraph().getJobGraph();
    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getAbsoluteFile().toURI().toString());
    config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, Parallelism);
    // Everything that must be cleaned up in the finally block is declared up front
    ActorSystem testSystem = null;
    final JobManagerProcess[] jobManagerProcess = new JobManagerProcess[2];
    LeaderRetrievalService leaderRetrievalService = null;
    ActorSystem taskManagerSystem = null;
    try {
        final Deadline deadline = TestTimeOut.fromNow();
        // Test actor system
        testSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));
        // The job managers
        jobManagerProcess[0] = new JobManagerProcess(0, config);
        jobManagerProcess[1] = new JobManagerProcess(1, config);
        jobManagerProcess[0].startProcess();
        jobManagerProcess[1].startProcess();
        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
        leaderRetrievalService.start(leaderListener);
        // The task manager
        taskManagerSystem = AkkaUtils.createActorSystem(config, Option.apply(new Tuple2<String, Object>("localhost", 0)));
        TaskManager.startTaskManagerComponentsAndActor(config, ResourceID.generate(), taskManagerSystem, "localhost", Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);
        {
            // Initial submission
            leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
            String leaderAddress = leaderListener.getAddress();
            UUID leaderId = leaderListener.getLeaderSessionID();
            // Get the leader ref
            ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
            ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
            // Submit the job in detached mode
            leader.tell(new SubmitJob(jobGraph, ListeningBehaviour.DETACHED));
            JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
        }
        // Who's the boss? Compare the first process' URL with the address seen by the listener.
        JobManagerProcess leadingJobManagerProcess;
        if (jobManagerProcess[0].getJobManagerAkkaURL(deadline.timeLeft()).equals(leaderListener.getAddress())) {
            leadingJobManagerProcess = jobManagerProcess[0];
        } else {
            leadingJobManagerProcess = jobManagerProcess[1];
        }
        // Only kill the leader after the expected number of checkpoints has been counted down
        CompletedCheckpointsLatch.await();
        // Kill the leading job manager process
        leadingJobManagerProcess.destroy();
        {
            // Recovery by the standby JobManager
            leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());
            String leaderAddress = leaderListener.getAddress();
            UUID leaderId = leaderListener.getLeaderSessionID();
            ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
            ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);
            JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
        }
        // Wait to finish
        FinalCountLatch.await();
        assertEquals(expectedSum, (long) FinalCount.get());
        // Every source must have recorded a non-zero recovered state
        for (int i = 0; i < Parallelism; i++) {
            assertNotEquals(0, RecoveredStates.get(i));
        }
    } catch (Throwable t) {
        // Reset all static state for test retries
        CompletedCheckpointsLatch = new CountDownLatch(2);
        RecoveredStates = new AtomicLongArray(Parallelism);
        FinalCountLatch = new CountDownLatch(1);
        FinalCount = new AtomicReference<>();
        LastElement = -1;
        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();
        // In case of an error, print the job manager process logs.
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].printProcessLog();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].printProcessLog();
        }
        throw t;
    } finally {
        // Tear down in every case: processes first, then services and actor systems
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].destroy();
        }
        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].destroy();
        }
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }
        if (taskManagerSystem != null) {
            taskManagerSystem.shutdown();
        }
        if (testSystem != null) {
            testSystem.shutdown();
        }
    }
}
Also used : ActorSystem(akka.actor.ActorSystem) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) TestingListener(org.apache.flink.runtime.leaderelection.TestingListener) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) UUID(java.util.UUID) SubmitJob(org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob) Deadline(scala.concurrent.duration.Deadline) AtomicReference(java.util.concurrent.atomic.AtomicReference) CountDownLatch(java.util.concurrent.CountDownLatch) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) Some(scala.Some) LeaderRetrievalService(org.apache.flink.runtime.leaderretrieval.LeaderRetrievalService) JobManagerProcess(org.apache.flink.runtime.testutils.JobManagerProcess) AtomicLongArray(java.util.concurrent.atomic.AtomicLongArray) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) Test(org.junit.Test) RetryOnFailure(org.apache.flink.testutils.junit.RetryOnFailure)

Example 9 with Deadline

use of scala.concurrent.duration.Deadline in project flink by apache.

the class JobManagerHAJobGraphRecoveryITCase method testJobPersistencyWhenJobManagerShutdown.

// ---------------------------------------------------------------------------------------------
/**
 * Verifies that the persisted HA job data survives a shutdown of the JobManager, i.e. that
 * stopping the cluster does not clean the job up.
 */
@Test
public void testJobPersistencyWhenJobManagerShutdown() throws Exception {
    final Configuration haConfig = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());
    // Cluster layout: a single JobManager with a single TaskManager
    haConfig.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, 1);
    haConfig.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1);
    final TestingCluster cluster = new TestingCluster(haConfig, false, false);
    try {
        final Deadline testDeadline = TestTimeOut.fromNow();
        // Bring up the JobManager and the TaskManager
        cluster.start(true);
        final JobGraph blockingJob = createBlockingJobGraph();
        // Guard against shut down races with a restart strategy: should the TM fail
        // before the JM, the job could otherwise be failed, which leads to state removal.
        final ExecutionConfig restartingConfig = new ExecutionConfig();
        restartingConfig.setRestartStrategy(RestartStrategies.fixedDelayRestart(Integer.MAX_VALUE, 100));
        blockingJob.setExecutionConfig(restartingConfig);
        final ActorGateway leaderGateway = cluster.getLeaderGateway(testDeadline.timeLeft());
        // Hand the job over in detached mode ...
        leaderGateway.tell(new SubmitJob(blockingJob, ListeningBehaviour.DETACHED));
        // ... and block until it is reported as running
        JobManagerActorTestUtils.waitForJobStatus(blockingJob.getJobID(), JobStatus.RUNNING, leaderGateway, testDeadline.timeLeft());
    } finally {
        cluster.shutdown();
    }
    // After the JM has been shut down, the persisted job data must still be present
    // in ZooKeeper.
    verifyRecoveryState(haConfig);
}
Also used : JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) SubmittedJobGraph(org.apache.flink.runtime.jobmanager.SubmittedJobGraph) TestingCluster(org.apache.flink.runtime.testingUtils.TestingCluster) Configuration(org.apache.flink.configuration.Configuration) Deadline(scala.concurrent.duration.Deadline) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) ExecutionConfig(org.apache.flink.api.common.ExecutionConfig) SubmitJob(org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob) Test(org.junit.Test)

Example 10 with Deadline

use of scala.concurrent.duration.Deadline in project flink by apache.

the class JobManagerHAJobGraphRecoveryITCase method testSubmitJobToNonLeader.

/**
 * Verifies that a job submitted to a JobManager that is not the leader is still handled:
 * the leader ends up running it and the non-leader drops it from its local state.
 */
@Test
public void testSubmitJobToNonLeader() throws Exception {
    final Configuration haConfig = ZooKeeperTestUtils.createZooKeeperHAConfig(ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());
    // Cluster layout: two JobManagers (leader + standby) and one TaskManager
    haConfig.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, 2);
    haConfig.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1);
    final TestingCluster cluster = new TestingCluster(haConfig, false, false);
    try {
        final Deadline testDeadline = TestTimeOut.fromNow();
        // Bring up the JobManagers and the TaskManager
        cluster.start(true);
        final JobGraph blockingJob = createBlockingJobGraph();
        final List<ActorRef> jobManagerRefs = cluster.getJobManagersAsJava();
        final ActorGateway leadingJobManager = cluster.getLeaderGateway(testDeadline.timeLeft());
        // Whichever of the two JobManagers is not the leader becomes the submission target
        final int nonLeaderIndex = jobManagerRefs.get(0).equals(leadingJobManager.actor()) ? 1 : 0;
        final ActorGateway nonLeadingJobManager = new AkkaActorGateway(jobManagerRefs.get(nonLeaderIndex), null);
        log.info("Leading job manager: " + leadingJobManager);
        log.info("Non-leading job manager: " + nonLeadingJobManager);
        // Submit the job to the non-leader
        nonLeadingJobManager.tell(new SubmitJob(blockingJob, ListeningBehaviour.DETACHED));
        log.info("Submitted job graph to " + nonLeadingJobManager);
        // The behaviour under test: although the job went to the non-leading JM, it is
        // the *leading* JM that is asked for (and must report) the RUNNING status.
        JobManagerActorTestUtils.waitForJobStatus(blockingJob.getJobID(), JobStatus.RUNNING, leadingJobManager, testDeadline.timeLeft());
        log.info("Wait that the non-leader removes the submitted job.");
        // Poll the **non-leading** JM until it no longer knows the job, or time runs out
        boolean removedFromNonLeader = false;
        while (testDeadline.hasTimeLeft()) {
            final JobStatusResponse response = JobManagerActorTestUtils.requestJobStatus(blockingJob.getJobID(), nonLeadingJobManager, testDeadline.timeLeft());
            if (response instanceof JobManagerMessages.JobNotFound) {
                removedFromNonLeader = true;
                break;
            }
            // Still known to the non-leader: log its current status and retry shortly
            log.info(((JobManagerMessages.CurrentJobStatus) response).status().toString());
            Thread.sleep(100);
        }
        if (!removedFromNonLeader) {
            fail("Non-leading JM was still holding reference to the job graph.");
        }
        // Register for the removal notification first, then cancel and wait for it
        final Future<Object> removalNotification = leadingJobManager.ask(new TestingJobManagerMessages.NotifyWhenJobRemoved(blockingJob.getJobID()), testDeadline.timeLeft());
        leadingJobManager.tell(new JobManagerMessages.CancelJob(blockingJob.getJobID()));
        Await.ready(removalNotification, testDeadline.timeLeft());
    } finally {
        cluster.shutdown();
    }
    // Nothing may be left behind in the recovery state
    verifyCleanRecoveryState(haConfig);
}
Also used : AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) JobStatusResponse(org.apache.flink.runtime.messages.JobManagerMessages.JobStatusResponse) Configuration(org.apache.flink.configuration.Configuration) ActorRef(akka.actor.ActorRef) TestActorRef(akka.testkit.TestActorRef) Deadline(scala.concurrent.duration.Deadline) JobManagerMessages(org.apache.flink.runtime.messages.JobManagerMessages) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) JobGraph(org.apache.flink.runtime.jobgraph.JobGraph) SubmittedJobGraph(org.apache.flink.runtime.jobmanager.SubmittedJobGraph) TestingCluster(org.apache.flink.runtime.testingUtils.TestingCluster) TestingJobManagerMessages(org.apache.flink.runtime.testingUtils.TestingJobManagerMessages) ActorGateway(org.apache.flink.runtime.instance.ActorGateway) AkkaActorGateway(org.apache.flink.runtime.instance.AkkaActorGateway) SubmitJob(org.apache.flink.runtime.messages.JobManagerMessages.SubmitJob) Test(org.junit.Test)

Aggregations

Deadline (scala.concurrent.duration.Deadline)59 Test (org.junit.Test)50 JobGraph (org.apache.flink.runtime.jobgraph.JobGraph)31 Configuration (org.apache.flink.configuration.Configuration)28 FiniteDuration (scala.concurrent.duration.FiniteDuration)24 JobID (org.apache.flink.api.common.JobID)21 ActorGateway (org.apache.flink.runtime.instance.ActorGateway)20 ActorRef (akka.actor.ActorRef)12 File (java.io.File)12 JobVertex (org.apache.flink.runtime.jobgraph.JobVertex)12 TestingCluster (org.apache.flink.runtime.testingUtils.TestingCluster)12 Tuple2 (org.apache.flink.api.java.tuple.Tuple2)10 AkkaActorGateway (org.apache.flink.runtime.instance.AkkaActorGateway)10 StreamExecutionEnvironment (org.apache.flink.streaming.api.environment.StreamExecutionEnvironment)10 ActorSystem (akka.actor.ActorSystem)9 ArrayList (java.util.ArrayList)9 JobManagerMessages (org.apache.flink.runtime.messages.JobManagerMessages)9 UUID (java.util.UUID)8 AtomicLong (java.util.concurrent.atomic.AtomicLong)8 KeySelector (org.apache.flink.api.java.functions.KeySelector)8