Use of scala.concurrent.duration.Deadline in project flink by apache.
From class ZooKeeperLeaderElectionTest, method testZooKeeperReelection.
/**
 * Repeatedly tests the re-election of the still available LeaderContenders. After a contender
 * has been elected leader, it is removed. This forces the ZooKeeperLeaderElectionService to
 * elect a new leader.
 */
@Test
public void testZooKeeperReelection() throws Exception {
    Configuration configuration = new Configuration();
    configuration.setString(HighAvailabilityOptions.HA_ZOOKEEPER_QUORUM, testingServer.getConnectString());
    configuration.setString(HighAvailabilityOptions.HA_MODE, "zookeeper");

    Deadline deadline = new FiniteDuration(5, TimeUnit.MINUTES).fromNow();

    int num = 20;

    ZooKeeperLeaderElectionService[] leaderElectionService = new ZooKeeperLeaderElectionService[num];
    TestingContender[] contenders = new TestingContender[num];
    ZooKeeperLeaderRetrievalService leaderRetrievalService = null;

    TestingListener listener = new TestingListener();

    try {
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(configuration);

        LOG.debug("Start leader retrieval service for the TestingListener.");
        leaderRetrievalService.start(listener);

        for (int i = 0; i < num; i++) {
            leaderElectionService[i] = ZooKeeperUtils.createLeaderElectionService(configuration);
            contenders[i] = new TestingContender(TEST_URL + "_" + i, leaderElectionService[i]);

            LOG.debug("Start leader election service for contender #{}.", i);
            leaderElectionService[i].start(contenders[i]);
        }

        String pattern = TEST_URL + "_" + "(\\d+)";
        Pattern regex = Pattern.compile(pattern);

        int numberSeenLeaders = 0;

        while (deadline.hasTimeLeft() && numberSeenLeaders < num) {
            LOG.debug("Wait for new leader #{}.", numberSeenLeaders);
            String address = listener.waitForNewLeader(deadline.timeLeft().toMillis());

            Matcher m = regex.matcher(address);

            if (m.find()) {
                int index = Integer.parseInt(m.group(1));
                TestingContender contender = contenders[index];

                // check that the retrieval service has retrieved the correct leader
                if (address.equals(contender.getAddress()) && listener.getLeaderSessionID().equals(contender.getLeaderSessionID())) {
                    // kill the election service of the leader
                    LOG.debug("Stop leader election service of contender #{}.", index);
                    leaderElectionService[index].stop();
                    leaderElectionService[index] = null;

                    numberSeenLeaders++;
                }
            } else {
                fail("Did not find the leader's index.");
            }
        }

        assertFalse(deadline.isOverdue());
        assertEquals(num, numberSeenLeaders);
    } finally {
        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }

        for (ZooKeeperLeaderElectionService electionService : leaderElectionService) {
            if (electionService != null) {
                electionService.stop();
            }
        }
    }
}
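The core Deadline idiom in this test, stripped of the ZooKeeper machinery, is: mint a deadline from a FiniteDuration, loop while hasTimeLeft(), hand each blocking wait only the remaining budget via timeLeft().toMillis(), and assert with isOverdue() afterwards. Below is a minimal self-contained sketch of that idiom; the class name, checkCondition, and the five-second budget are hypothetical, not part of the Flink code above.

import java.util.concurrent.TimeUnit;

import scala.concurrent.duration.Deadline;
import scala.concurrent.duration.FiniteDuration;

public class DeadlineLoopSketch {

    public static void main(String[] args) throws InterruptedException {
        // Create a deadline that expires five seconds from now.
        Deadline deadline = new FiniteDuration(5, TimeUnit.SECONDS).fromNow();

        boolean done = false;
        while (deadline.hasTimeLeft() && !done) {
            done = checkCondition();
            if (!done) {
                // Sleep briefly between polls, but never past the remaining budget
                // (clamped to at least 1 ms in case the budget just ran out).
                Thread.sleep(Math.min(100L, Math.max(1L, deadline.timeLeft().toMillis())));
            }
        }

        if (!done) {
            throw new IllegalStateException("Condition not met before the deadline.");
        }
    }

    // Hypothetical condition; a real test would poll actual state here.
    private static boolean checkCondition() {
        return false;
    }
}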
Use of scala.concurrent.duration.Deadline in project flink by apache.
From class OneInputStreamTaskTest, method testSnapshottingAndRestoring.
/**
 * Tests that the stream operator can snapshot and restore the operator state of chained
 * operators.
 */
@Test
public void testSnapshottingAndRestoring() throws Exception {
    final Deadline deadline = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();

    final OneInputStreamTask<String, String> streamTask = new OneInputStreamTask<String, String>();
    final OneInputStreamTaskTestHarness<String, String> testHarness =
        new OneInputStreamTaskTestHarness<String, String>(
            streamTask, BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO);
    testHarness.setupOutputForSingletonOperatorChain();

    IdentityKeySelector<String> keySelector = new IdentityKeySelector<>();
    testHarness.configureForKeyedStream(keySelector, BasicTypeInfo.STRING_TYPE_INFO);

    long checkpointId = 1L;
    long checkpointTimestamp = 1L;
    long recoveryTimestamp = 3L;
    long seed = 2L;
    int numberChainedTasks = 11;

    StreamConfig streamConfig = testHarness.getStreamConfig();
    configureChainedTestingStreamOperator(streamConfig, numberChainedTasks, seed, recoveryTimestamp);

    AcknowledgeStreamMockEnvironment env = new AcknowledgeStreamMockEnvironment(
        testHarness.jobConfig,
        testHarness.taskConfig,
        testHarness.executionConfig,
        testHarness.memorySize,
        new MockInputSplitProvider(),
        testHarness.bufferSize);

    // reset number of restore calls
    TestingStreamOperator.numberRestoreCalls = 0;

    testHarness.invoke(env);
    testHarness.waitForTaskRunning(deadline.timeLeft().toMillis());

    CheckpointMetaData checkpointMetaData = new CheckpointMetaData(checkpointId, checkpointTimestamp);

    while (!streamTask.triggerCheckpoint(checkpointMetaData, CheckpointOptions.forFullCheckpoint())) {
    }

    // since no state was set, there shouldn't be restore calls
    assertEquals(0, TestingStreamOperator.numberRestoreCalls);

    env.getCheckpointLatch().await();

    assertEquals(checkpointId, env.getCheckpointId());

    testHarness.endInput();
    testHarness.waitForTaskCompletion(deadline.timeLeft().toMillis());

    final OneInputStreamTask<String, String> restoredTask = new OneInputStreamTask<String, String>();
    restoredTask.setInitialState(new TaskStateHandles(env.getCheckpointStateHandles()));

    final OneInputStreamTaskTestHarness<String, String> restoredTaskHarness =
        new OneInputStreamTaskTestHarness<String, String>(
            restoredTask, BasicTypeInfo.STRING_TYPE_INFO, BasicTypeInfo.STRING_TYPE_INFO);
    restoredTaskHarness.configureForKeyedStream(keySelector, BasicTypeInfo.STRING_TYPE_INFO);

    StreamConfig restoredTaskStreamConfig = restoredTaskHarness.getStreamConfig();
    configureChainedTestingStreamOperator(restoredTaskStreamConfig, numberChainedTasks, seed, recoveryTimestamp);

    TestingStreamOperator.numberRestoreCalls = 0;

    restoredTaskHarness.invoke();
    restoredTaskHarness.endInput();
    restoredTaskHarness.waitForTaskCompletion(deadline.timeLeft().toMillis());

    // restore of every chained operator should have been called
    assertEquals(numberChainedTasks, TestingStreamOperator.numberRestoreCalls);

    TestingStreamOperator.numberRestoreCalls = 0;
}
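A detail worth noting in this test: the same Deadline is threaded through both waitForTaskRunning and waitForTaskCompletion, so the two waits draw from one shared two-minute budget rather than each getting its own. Here is a minimal self-contained sketch of that pattern, with hypothetical CountDownLatch signals standing in for the harness calls:

import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;

import scala.concurrent.duration.Deadline;
import scala.concurrent.duration.FiniteDuration;

public class SharedDeadlineSketch {

    public static void main(String[] args) throws InterruptedException {
        // One budget for the whole sequence of waits.
        Deadline deadline = new FiniteDuration(2, TimeUnit.MINUTES).fromNow();

        CountDownLatch taskRunning = new CountDownLatch(1);   // hypothetical "task is running" signal
        CountDownLatch taskFinished = new CountDownLatch(1);  // hypothetical "task has finished" signal

        new Thread(() -> {
            taskRunning.countDown();
            taskFinished.countDown();
        }).start();

        // Each wait only receives the time that is still left, so the two
        // waits together can never exceed the two-minute budget.
        if (!taskRunning.await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS)) {
            throw new IllegalStateException("Task did not start in time.");
        }
        if (!taskFinished.await(deadline.timeLeft().toMillis(), TimeUnit.MILLISECONDS)) {
            throw new IllegalStateException("Task did not finish in time.");
        }
    }
}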
Use of scala.concurrent.duration.Deadline in project flink by apache.
From class JobManagerHACheckpointRecoveryITCase, method testCheckpointedStreamingSumProgram.
/**
 * Simple checkpointed streaming sum.
 *
 * <p>The sources (running at parallelism Parallelism) count up to sequenceEnd. The sink
 * (parallelism 1) sums up all counts and returns the result to the main thread via a static
 * variable. We wait until some checkpoints have completed and sanity-check that the sources
 * recover with an updated state, to make sure that this test actually tests something.
 */
@Test
@RetryOnFailure(times = 1)
public void testCheckpointedStreamingSumProgram() throws Exception {
    // Config
    final int checkpointingInterval = 200;
    final int sequenceEnd = 5000;
    final long expectedSum = Parallelism * sequenceEnd * (sequenceEnd + 1) / 2;

    final StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment();
    env.setParallelism(Parallelism);
    env.enableCheckpointing(checkpointingInterval);

    env.addSource(new CheckpointedSequenceSource(sequenceEnd)).addSink(new CountingSink()).setParallelism(1);

    JobGraph jobGraph = env.getStreamGraph().getJobGraph();

    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(
        ZooKeeper.getConnectString(), FileStateBackendBasePath.getAbsoluteFile().toURI().toString());
    config.setInteger(ConfigConstants.TASK_MANAGER_NUM_TASK_SLOTS, Parallelism);

    ActorSystem testSystem = null;
    final JobManagerProcess[] jobManagerProcess = new JobManagerProcess[2];
    LeaderRetrievalService leaderRetrievalService = null;
    ActorSystem taskManagerSystem = null;

    try {
        final Deadline deadline = TestTimeOut.fromNow();

        // Test actor system
        testSystem = AkkaUtils.createActorSystem(new Configuration(), new Some<>(new Tuple2<String, Object>("localhost", 0)));

        // The job managers
        jobManagerProcess[0] = new JobManagerProcess(0, config);
        jobManagerProcess[1] = new JobManagerProcess(1, config);

        jobManagerProcess[0].startProcess();
        jobManagerProcess[1].startProcess();

        // Leader listener
        TestingListener leaderListener = new TestingListener();
        leaderRetrievalService = ZooKeeperUtils.createLeaderRetrievalService(config);
        leaderRetrievalService.start(leaderListener);

        // The task manager
        taskManagerSystem = AkkaUtils.createActorSystem(config, Option.apply(new Tuple2<String, Object>("localhost", 0)));
        TaskManager.startTaskManagerComponentsAndActor(
            config, ResourceID.generate(), taskManagerSystem, "localhost",
            Option.<String>empty(), Option.<LeaderRetrievalService>empty(), false, TaskManager.class);

        {
            // Initial submission
            leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());

            String leaderAddress = leaderListener.getAddress();
            UUID leaderId = leaderListener.getLeaderSessionID();

            // Get the leader ref
            ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
            ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);

            // Submit the job in detached mode
            leader.tell(new SubmitJob(jobGraph, ListeningBehaviour.DETACHED));

            JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
        }

        // Who's the boss?
        JobManagerProcess leadingJobManagerProcess;
        if (jobManagerProcess[0].getJobManagerAkkaURL(deadline.timeLeft()).equals(leaderListener.getAddress())) {
            leadingJobManagerProcess = jobManagerProcess[0];
        } else {
            leadingJobManagerProcess = jobManagerProcess[1];
        }

        CompletedCheckpointsLatch.await();

        // Kill the leading job manager process
        leadingJobManagerProcess.destroy();

        {
            // Recovery by the standby JobManager
            leaderListener.waitForNewLeader(deadline.timeLeft().toMillis());

            String leaderAddress = leaderListener.getAddress();
            UUID leaderId = leaderListener.getLeaderSessionID();

            ActorRef leaderRef = AkkaUtils.getActorRef(leaderAddress, testSystem, deadline.timeLeft());
            ActorGateway leader = new AkkaActorGateway(leaderRef, leaderId);

            JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leader, deadline.timeLeft());
        }

        // Wait to finish
        FinalCountLatch.await();

        assertEquals(expectedSum, (long) FinalCount.get());

        for (int i = 0; i < Parallelism; i++) {
            assertNotEquals(0, RecoveredStates.get(i));
        }
    } catch (Throwable t) {
        // Reset all static state for test retries
        CompletedCheckpointsLatch = new CountDownLatch(2);
        RecoveredStates = new AtomicLongArray(Parallelism);
        FinalCountLatch = new CountDownLatch(1);
        FinalCount = new AtomicReference<>();
        LastElement = -1;

        // Print early (in some situations the process logs get too big
        // for Travis and the root problem is not shown)
        t.printStackTrace();

        // In case of an error, print the job manager process logs.
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].printProcessLog();
        }

        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].printProcessLog();
        }

        throw t;
    } finally {
        if (jobManagerProcess[0] != null) {
            jobManagerProcess[0].destroy();
        }

        if (jobManagerProcess[1] != null) {
            jobManagerProcess[1].destroy();
        }

        if (leaderRetrievalService != null) {
            leaderRetrievalService.stop();
        }

        if (taskManagerSystem != null) {
            taskManagerSystem.shutdown();
        }

        if (testSystem != null) {
            testSystem.shutdown();
        }
    }
}
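Note how this test passes deadline.timeLeft() directly to the Akka helpers and to waitForJobStatus: timeLeft() returns a scala.concurrent.duration.FiniteDuration, which Scala-style APIs such as Await accept without conversion, whereas Java APIs get timeLeft().toMillis() (as in waitForNewLeader above). Below is a small sketch of the Await usage from Java; the Promise plumbing and the 30-second budget are assumptions for illustration only, and Scala's Promise.apply() is reached from Java through the Promise$ companion instance:

import java.util.concurrent.TimeUnit;

import scala.concurrent.Await;
import scala.concurrent.Future;
import scala.concurrent.Promise;
import scala.concurrent.duration.Deadline;
import scala.concurrent.duration.FiniteDuration;

public class AwaitWithDeadlineSketch {

    public static void main(String[] args) throws Exception {
        Deadline deadline = new FiniteDuration(30, TimeUnit.SECONDS).fromNow();

        // Promise.apply() is a Scala object method; from Java it is reached
        // through the Promise$ companion instance.
        Promise<String> promise = scala.concurrent.Promise$.MODULE$.apply();
        Future<String> future = promise.future();

        // Complete the future from another thread.
        new Thread(() -> promise.success("done")).start();

        // timeLeft() is a FiniteDuration, which is exactly what Await expects,
        // so the remaining budget can be passed through without conversion.
        String result = Await.result(future, deadline.timeLeft());
        System.out.println(result);
    }
}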
Use of scala.concurrent.duration.Deadline in project flink by apache.
From class JobManagerHAJobGraphRecoveryITCase, method testJobPersistencyWhenJobManagerShutdown.
// ---------------------------------------------------------------------------------------------
/**
 * Tests that the HA job is not cleaned up when the JobManager is stopped.
 */
@Test
public void testJobPersistencyWhenJobManagerShutdown() throws Exception {
    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(
        ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());

    // Configure the cluster
    config.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, 1);
    config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1);

    TestingCluster flink = new TestingCluster(config, false, false);

    try {
        final Deadline deadline = TestTimeOut.fromNow();

        // Start the JobManager and TaskManager
        flink.start(true);

        JobGraph jobGraph = createBlockingJobGraph();

        // Set restart strategy to guard against shut down races.
        // If the TM fails before the JM, it might happen that the
        // Job is failed, leading to state removal.
        ExecutionConfig ec = new ExecutionConfig();
        ec.setRestartStrategy(RestartStrategies.fixedDelayRestart(Integer.MAX_VALUE, 100));
        jobGraph.setExecutionConfig(ec);

        ActorGateway jobManager = flink.getLeaderGateway(deadline.timeLeft());

        // Submit the job
        jobManager.tell(new SubmitJob(jobGraph, ListeningBehaviour.DETACHED));

        // Wait for the job to start
        JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, jobManager, deadline.timeLeft());
    } finally {
        flink.shutdown();
    }

    // verify that the persisted job data has not been removed from ZooKeeper when the JM has
    // been shut down
    verifyRecoveryState(config);
}
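Unlike the earlier tests, this one does not build the FiniteDuration inline: TestTimeOut is a shared duration constant, and each test mints its own Deadline from it with fromNow(), so every test gets a full budget measured from its own start time. A sketch of that convention follows; TEST_TIMEOUT, someTest, and doBlockingStep are hypothetical names, not the Flink identifiers:

import java.util.concurrent.TimeUnit;

import scala.concurrent.duration.Deadline;
import scala.concurrent.duration.FiniteDuration;

public class TestTimeoutSketch {

    // One shared duration for the whole test class...
    private static final FiniteDuration TEST_TIMEOUT = new FiniteDuration(5, TimeUnit.MINUTES);

    public void someTest() {
        // ...and a fresh Deadline minted at the start of each test, so each
        // test measures its budget from its own start time.
        final Deadline deadline = TEST_TIMEOUT.fromNow();

        // Pass the remaining budget to each blocking call as the test proceeds.
        doBlockingStep(deadline.timeLeft());
    }

    // Hypothetical blocking step that accepts a timeout.
    private void doBlockingStep(FiniteDuration timeout) {
    }
}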
Use of scala.concurrent.duration.Deadline in project flink by apache.
From class JobManagerHAJobGraphRecoveryITCase, method testSubmitJobToNonLeader.
/**
 * Tests that job submissions to a non-leading JobManager are handled correctly.
 */
@Test
public void testSubmitJobToNonLeader() throws Exception {
    Configuration config = ZooKeeperTestUtils.createZooKeeperHAConfig(
        ZooKeeper.getConnectString(), FileStateBackendBasePath.getPath());

    // Configure the cluster
    config.setInteger(ConfigConstants.LOCAL_NUMBER_JOB_MANAGER, 2);
    config.setInteger(ConfigConstants.LOCAL_NUMBER_TASK_MANAGER, 1);

    TestingCluster flink = new TestingCluster(config, false, false);

    try {
        final Deadline deadline = TestTimeOut.fromNow();

        // Start the JobManager and TaskManager
        flink.start(true);

        JobGraph jobGraph = createBlockingJobGraph();

        List<ActorRef> bothJobManagers = flink.getJobManagersAsJava();

        ActorGateway leadingJobManager = flink.getLeaderGateway(deadline.timeLeft());

        ActorGateway nonLeadingJobManager;
        if (bothJobManagers.get(0).equals(leadingJobManager.actor())) {
            nonLeadingJobManager = new AkkaActorGateway(bothJobManagers.get(1), null);
        } else {
            nonLeadingJobManager = new AkkaActorGateway(bothJobManagers.get(0), null);
        }

        log.info("Leading job manager: " + leadingJobManager);
        log.info("Non-leading job manager: " + nonLeadingJobManager);

        // Submit the job
        nonLeadingJobManager.tell(new SubmitJob(jobGraph, ListeningBehaviour.DETACHED));
        log.info("Submitted job graph to " + nonLeadingJobManager);

        // Wait for the job to start. We are asking the **leading** JM here although we've
        // submitted the job to the non-leading JM. This is the behaviour under test.
        JobManagerActorTestUtils.waitForJobStatus(jobGraph.getJobID(), JobStatus.RUNNING, leadingJobManager, deadline.timeLeft());

        log.info("Wait until the non-leader removes the submitted job.");

        // Make sure that the **non-leading** JM has actually removed the job graph from its
        // local state.
        boolean success = false;
        while (!success && deadline.hasTimeLeft()) {
            JobStatusResponse jobStatusResponse = JobManagerActorTestUtils.requestJobStatus(
                jobGraph.getJobID(), nonLeadingJobManager, deadline.timeLeft());

            if (jobStatusResponse instanceof JobManagerMessages.JobNotFound) {
                success = true;
            } else {
                log.info(((JobManagerMessages.CurrentJobStatus) jobStatusResponse).status().toString());
                Thread.sleep(100);
            }
        }

        if (!success) {
            fail("Non-leading JM was still holding a reference to the job graph.");
        }

        Future<Object> jobRemoved = leadingJobManager.ask(
            new TestingJobManagerMessages.NotifyWhenJobRemoved(jobGraph.getJobID()), deadline.timeLeft());

        leadingJobManager.tell(new JobManagerMessages.CancelJob(jobGraph.getJobID()));

        Await.ready(jobRemoved, deadline.timeLeft());
    } finally {
        flink.shutdown();
    }

    // Verify that everything is clean
    verifyCleanRecoveryState(config);
}
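Across all of these examples, three Deadline accessors do the work: timeLeft(), isOverdue(), and hasTimeLeft(). The sketch below lists them in one place; the one-second duration is illustrative:

import java.util.concurrent.TimeUnit;

import scala.concurrent.duration.Deadline;
import scala.concurrent.duration.FiniteDuration;

public class DeadlineAccessorsSketch {

    public static void main(String[] args) {
        Deadline deadline = new FiniteDuration(1, TimeUnit.SECONDS).fromNow();

        FiniteDuration remaining = deadline.timeLeft(); // can go negative once expired
        boolean expired = deadline.isOverdue();         // true once the instant has passed
        boolean stillOk = deadline.hasTimeLeft();       // effectively the complement of isOverdue()

        System.out.println(remaining.toMillis() + " ms left; overdue=" + expired + ", hasTimeLeft=" + stillOk);
    }
}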